249 lines
7.8 KiB
C
249 lines
7.8 KiB
C
#include "../include/syntax_highlighter.h"
|
|
#include "../include/data.h"
|
|
#include "../include/utf8.h"
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
|
|
extern struct editorConfig E;
|
|
|
|
// Words highlighted as keywords, NULL-terminated; searched by is_keyword().
// Preprocessor directives are listed with their leading '#' because the
// tokenizer treats '#' as a word character.
const char *c_keywords[] = {
    // Control flow
    "if", "else", "while", "for", "do", "switch",
    "case", "default", "break", "continue", "return", "goto",
    // Preprocessor directives
    "#include", "#define", "#undef", "#pragma",
    "#if", "#ifdef", "#ifndef", "#else", "#elif", "#endif",
    // Declarations, storage classes and qualifiers
    "struct", "union", "enum", "typedef", "static",
    "extern", "const", "volatile", "sizeof", "auto", "register",
    "inline", "restrict",
    NULL};
|
|
|
|
// Built-in C type names, NULL-terminated; searched by is_type().
const char *c_types[] = {
    "int",
    "char",
    "float",
    "double",
    "void",
    "long",
    "short",
    "unsigned",
    "signed",
    "bool",
    NULL,
};
|
|
|
|
// Returns the byte length of the UTF-8 character starting at s.
//
// Returns 0 only when s points at the NUL terminator. For any other byte the
// result is at least 1, so callers that advance by the result always make
// progress. A multi-byte length (2-4) is reported only when every expected
// continuation byte (10xxxxxx) is actually present; a stray continuation
// byte, an invalid lead byte, or a sequence cut short (for example by the end
// of the string) counts as a single byte. This guarantees the reported
// length never reaches past the terminating NUL.
static int utf8_char_len(const char *s)
{
    const unsigned char *p = (const unsigned char *)s;
    int expected;

    if (p[0] == 0)
        return 0;
    if (p[0] < 0x80)
        return 1;

    if ((p[0] & 0xE0) == 0xC0)
        expected = 2;
    else if ((p[0] & 0xF0) == 0xE0)
        expected = 3;
    else if ((p[0] & 0xF8) == 0xF0)
        expected = 4;
    else
        return 1; // continuation byte or invalid lead byte

    // Bytes are examined in order and the scan stops at the first one that
    // is not a continuation byte (NUL included), so nothing beyond the
    // terminator is ever read.
    for (int k = 1; k < expected; k++) {
        if ((p[k] & 0xC0) != 0x80)
            return 1;
    }
    return expected;
}
|
|
|
|
// Returns 1 when the UTF-8 character at s may appear inside a word
// (identifier, keyword or preprocessor directive), 0 otherwise.
//
// '_' and '#' are accepted in addition to ASCII letters and digits; '#' is
// included so directives such as "#include" are scanned as a single word.
// The remaining ranges are inclusive Unicode code point ranges.
int is_word_char(const char *s)
{
    static const struct {
        uint32_t lo;
        uint32_t hi;
    } word_ranges[] = {
        {'a', 'z'},
        {'A', 'Z'},
        {'0', '9'},
        {0x00C0, 0x017F}, // Latin-1 letters and Latin Extended-A
        {0x0370, 0x03FF}, // Greek
        {0x0400, 0x04FF}, // Cyrillic
        {0x0600, 0x06FF}, // Arabic
        {0x05D0, 0x05EA}, // Hebrew letters
        {0x0900, 0x097F}, // Devanagari
        {0x4E00, 0x9FFF}, // CJK Unified Ideographs
        {0x3040, 0x309F}, // Hiragana
        {0x30A0, 0x30FF}, // Katakana
        {0xAC00, 0xD7A3}, // Hangul syllables
    };
    const uint32_t cp = utf8Decode(&s);

    if (cp == '_' || cp == '#')
        return 1;

    // U+FFFD lies outside every range below, so it falls through to the
    // final "return 0" like any other unlisted code point.
    for (size_t k = 0; k < sizeof word_ranges / sizeof word_ranges[0]; k++) {
        if (cp >= word_ranges[k].lo && cp <= word_ranges[k].hi)
            return 1;
    }
    return 0;
}
|
|
|
|
// Copy one UTF-8 character from src + *src_pos to dst + *dst_pos and advance
// both indices by the number of bytes copied.
//
// Copying stops early at a NUL byte, so even when the reported character
// length is larger than what is left of the string, the source index never
// moves past the terminator and no NUL is written into dst. When
// src[*src_pos] is already NUL nothing is copied and neither index changes.
// The caller must ensure dst has room for up to 4 more bytes.
static void copy_utf8_char(char *dst, int *dst_pos, const char *src, int *src_pos)
{
    int len = utf8_char_len(&src[*src_pos]);

    for (int b = 0; b < len && src[*src_pos] != '\0'; b++)
        dst[(*dst_pos)++] = src[(*src_pos)++];
}
|
|
|
|
// Word-character classification is done by is_word_char(), defined earlier in this file.
|
|
|
|
// Check if string is a keyword
|
|
int is_keyword(const char *word) {
|
|
for (int i = 0; c_keywords[i] != NULL; i++) {
|
|
if (strcmp(word, c_keywords[i]) == 0)
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
// Check if string is a type
|
|
int is_type(const char *word) {
|
|
for (int i = 0; c_types[i] != NULL; i++) {
|
|
if (strcmp(word, c_types[i]) == 0)
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
// Get color code for token type
|
|
const char *get_color(TokenType type) {
|
|
switch (type) {
|
|
case TOKEN_KEYWORD:
|
|
return E.theme.COLOR_KEYWORD;
|
|
case TOKEN_TYPE:
|
|
return E.theme.COLOR_TYPE;
|
|
case TOKEN_STRING:
|
|
return E.theme.COLOR_STRING;
|
|
case TOKEN_COMMENT:
|
|
return E.theme.COLOR_COMMENT;
|
|
case TOKEN_NUMBER:
|
|
return E.theme.COLOR_NUMBER;
|
|
default:
|
|
return E.theme.COLOR_DEFAULT;
|
|
}
|
|
}
|
|
|
|
// Block-comment state carried between highlight_line() calls: set to 1 when
// a "/*" is opened and cleared when a closing "*/" is seen. While it is
// nonzero, highlight_line() colors the text it scans as comment, which is
// how a comment spanning several lines keeps its color on the later lines.
int comment_section = 0;
|
|
|
|
// Highlight a line of C code and return the highlighted string
|
|
// Returns a newly allocated string that must be freed by the caller
|
|
char *highlight_line(const char *line, int *length) {
|
|
// Each byte can expand to at most (color_prefix + 4 bytes + color_reset).
|
|
// Allocate generously based on line length to avoid overflow.
|
|
int line_len = strlen(line);
|
|
int buf_size = line_len * 32 + 256;
|
|
char *result = malloc(buf_size);
|
|
int result_pos = 0;
|
|
int i = 0;
|
|
|
|
while (line[i] != '\0' && line[i] != '\n') {
|
|
// Skip whitespace — copy full UTF-8 char (whitespace is always ASCII,
|
|
// but using copy_utf8_char keeps the pattern consistent)
|
|
if (line[i] == ' ' || line[i] == '\t') {
|
|
copy_utf8_char(result, &result_pos, line, &i);
|
|
continue;
|
|
}
|
|
|
|
if (comment_section) {
|
|
result_pos += sprintf(&result[result_pos], "%s", E.theme.COLOR_COMMENT);
|
|
while (line[i] != '\0' && line[i] != '\n') {
|
|
if (line[i] == '*' && line[i + 1] == '/') {
|
|
comment_section = 0;
|
|
}
|
|
copy_utf8_char(result, &result_pos, line, &i);
|
|
}
|
|
result_pos += sprintf(&result[result_pos], "%s", COLOR_RESET);
|
|
continue;
|
|
}
|
|
|
|
// Handle line comments
|
|
if (line[i] == '/' && line[i + 1] == '/') {
|
|
result_pos += sprintf(&result[result_pos], "%s", E.theme.COLOR_COMMENT);
|
|
while (line[i] != '\0' && line[i] != '\n') {
|
|
copy_utf8_char(result, &result_pos, line, &i);
|
|
}
|
|
result_pos += sprintf(&result[result_pos], "%s", COLOR_RESET);
|
|
continue;
|
|
}
|
|
|
|
// Handle block comments
|
|
if (line[i] == '/' && line[i + 1] == '*') {
|
|
comment_section = 1;
|
|
result_pos += sprintf(&result[result_pos], "%s", E.theme.COLOR_COMMENT);
|
|
result[result_pos++] = line[i++];
|
|
result[result_pos++] = line[i++];
|
|
while (line[i] != '\0') {
|
|
if (line[i] == '*' && line[i + 1] == '/') {
|
|
result[result_pos++] = line[i++];
|
|
result[result_pos++] = line[i++];
|
|
comment_section = 0;
|
|
break;
|
|
}
|
|
copy_utf8_char(result, &result_pos, line, &i);
|
|
}
|
|
result_pos += sprintf(&result[result_pos], "%s", COLOR_RESET);
|
|
continue;
|
|
}
|
|
|
|
// Handle strings
|
|
if (line[i] == '"') {
|
|
result_pos += sprintf(&result[result_pos], "%s\"", E.theme.COLOR_STRING);
|
|
i++;
|
|
while (line[i] != '\0' && line[i] != '"') {
|
|
if (line[i] == '\\') {
|
|
result[result_pos++] = line[i++];
|
|
copy_utf8_char(result, &result_pos, line, &i);
|
|
} else {
|
|
copy_utf8_char(result, &result_pos, line, &i);
|
|
}
|
|
}
|
|
if (line[i] == '"')
|
|
result[result_pos++] = line[i++];
|
|
result_pos += sprintf(&result[result_pos], "%s", COLOR_RESET);
|
|
continue;
|
|
}
|
|
|
|
// Handle character literals
|
|
if (line[i] == '\'') {
|
|
result_pos += sprintf(&result[result_pos], "%s'", E.theme.COLOR_STRING);
|
|
i++;
|
|
while (line[i] != '\0' && line[i] != '\'') {
|
|
if (line[i] == '\\') {
|
|
result[result_pos++] = line[i++];
|
|
copy_utf8_char(result, &result_pos, line, &i);
|
|
} else {
|
|
copy_utf8_char(result, &result_pos, line, &i);
|
|
}
|
|
}
|
|
if (line[i] == '\'')
|
|
result[result_pos++] = line[i++];
|
|
result_pos += sprintf(&result[result_pos], "%s", COLOR_RESET);
|
|
continue;
|
|
}
|
|
|
|
// Handle numbers
|
|
if (line[i] >= '0' && line[i] <= '9') {
|
|
result_pos += sprintf(&result[result_pos], "%s", E.theme.COLOR_NUMBER);
|
|
while (is_word_char(&line[i]) || line[i] == '.') {
|
|
copy_utf8_char(result, &result_pos, line, &i);
|
|
}
|
|
result_pos += sprintf(&result[result_pos], "%s", COLOR_RESET);
|
|
continue;
|
|
}
|
|
|
|
// Handle identifiers and keywords
|
|
if (is_word_char(&line[i])) {
|
|
int start = i;
|
|
// Advance by full UTF-8 character widths, not single bytes
|
|
while (line[i] != '\0' && is_word_char(&line[i]))
|
|
i += utf8_char_len(&line[i]);
|
|
|
|
char word[256];
|
|
int word_len = i - start;
|
|
if (word_len >= (int)sizeof(word))
|
|
word_len = (int)sizeof(word) - 1;
|
|
strncpy(word, &line[start], word_len);
|
|
word[word_len] = '\0';
|
|
|
|
TokenType type = TOKEN_DEFAULT;
|
|
if (is_keyword(word))
|
|
type = TOKEN_KEYWORD;
|
|
else if (is_type(word))
|
|
type = TOKEN_TYPE;
|
|
|
|
result_pos += sprintf(&result[result_pos], "%s%s%s", get_color(type),
|
|
word, COLOR_RESET);
|
|
continue;
|
|
}
|
|
|
|
// Handle operators and other characters (including non-ASCII multi-byte)
|
|
result_pos += sprintf(&result[result_pos], "%s", E.theme.COLOR_DEFAULT);
|
|
copy_utf8_char(result, &result_pos, line, &i);
|
|
result_pos += sprintf(&result[result_pos], "%s", COLOR_RESET);
|
|
}
|
|
|
|
result[result_pos] = '\0';
|
|
*length = result_pos + 1;
|
|
return result;
|
|
}
|