#include "../include/syntax_highlighter.h"
#include "../include/data.h"
#include "../include/utf8.h"
// NOTE(review): the system header names were missing from this copy of the
// file. The headers below cover the library calls made in this file
// (uint32_t, sprintf, malloc, strcmp/strlen/strncpy) -- confirm against the
// original list.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

extern struct editorConfig E;

// C keywords and preprocessor directives (NULL-terminated list)
const char *c_keywords[] = {
    "if",       "else",     "while",  "for",      "do",       "switch",
    "case",     "break",    "#include", "#define", "#if",     "#endif",
    "#ifndef",  "continue", "return", "goto",     "struct",   "union",
    "enum",     "typedef",  "static", "extern",   "const",    "volatile",
    "sizeof",   "auto",     "register", "inline", "restrict", NULL};

// C types (NULL-terminated list)
const char *c_types[] = {"int",   "char",     "float",  "double", "void", "long",
                         "short", "unsigned", "signed", "bool",   NULL};

// Returns the byte length of the UTF-8 character starting at s.
// Returns 0 only when s points at the NUL terminator; never returns 0 for a
// non-NUL byte, so callers won't infinite-loop.
//
// The lead byte only announces how many bytes the sequence should have. The
// result counts the continuation bytes (10xxxxxx) that are really there, so a
// truncated or malformed sequence never makes a caller step over the NUL
// terminator or swallow the ASCII byte that follows it.
static int utf8_char_len(const char *s)
{
    unsigned char lead = (unsigned char)*s;
    int expected;

    if (lead == 0)
        return 0;
    if (lead < 0x80)
        return 1;

    if ((lead & 0xE0) == 0xC0)
        expected = 2;
    else if ((lead & 0xF0) == 0xE0)
        expected = 3;
    else if ((lead & 0xF8) == 0xF0)
        expected = 4;
    else
        return 1; // continuation byte or invalid lead: advance 1 to make progress

    // s[len] is only read after s[len - 1] was seen to be non-NUL, so this
    // never reads beyond the terminator.
    int len = 1;
    while (len < expected && ((unsigned char)s[len] & 0xC0) == 0x80)
        len++;
    return len;
}

// Returns 1 when the UTF-8 character at s may be part of a word (identifier,
// keyword, preprocessor directive or number body), 0 otherwise.
//
// ASCII letters, digits, '_' and '#' qualify, as do code points in a fixed set
// of letter ranges. U+FFFD is always rejected (presumably what utf8Decode
// yields for malformed input -- confirm against utf8.h).
int is_word_char(const char *s)
{
    static const struct {
        uint32_t first;
        uint32_t last;
    } word_ranges[] = {
        {0x00C0, 0x017F}, // Latin-1 Supplement (from U+00C0) and Latin Extended-A
        {0x0370, 0x03FF}, // Greek and Coptic
        {0x0400, 0x04FF}, // Cyrillic
        {0x0600, 0x06FF}, // Arabic
        {0x05D0, 0x05EA}, // Hebrew letters
        {0x0900, 0x097F}, // Devanagari
        {0x4E00, 0x9FFF}, // CJK Unified Ideographs
        {0x3040, 0x309F}, // Hiragana
        {0x30A0, 0x30FF}, // Katakana
        {0xAC00, 0xD7A3}, // Hangul Syllables
    };

    // utf8Decode receives the address of the local copy of s, so the caller's
    // pointer is left untouched.
    uint32_t cp = utf8Decode(&s);

    if ((cp >= 'a' && cp <= 'z') || (cp >= 'A' && cp <= 'Z') ||
        (cp >= '0' && cp <= '9') || cp == '_' || cp == '#')
        return 1;

    if (cp == 0xFFFD)
        return 0;

    for (size_t r = 0; r < sizeof word_ranges / sizeof word_ranges[0]; r++) {
        if (cp >= word_ranges[r].first && cp <= word_ranges[r].last)
            return 1;
    }
    return 0;
}

// Copy one full UTF-8 character from src + *src_pos into dst + *dst_pos,
// advancing both indices.
static void copy_utf8_char(char *dst, int *dst_pos, const char *src, int *src_pos) { int len = utf8_char_len(&src[*src_pos]); for (int b = 0; b < len; b++) dst[(*dst_pos)++] = src[(*src_pos)++]; } // Check if character is alphanumeric or underscore // Check if string is a keyword int is_keyword(const char *word) { for (int i = 0; c_keywords[i] != NULL; i++) { if (strcmp(word, c_keywords[i]) == 0) return 1; } return 0; } // Check if string is a type int is_type(const char *word) { for (int i = 0; c_types[i] != NULL; i++) { if (strcmp(word, c_types[i]) == 0) return 1; } return 0; } // Get color code for token type const char *get_color(TokenType type) { switch (type) { case TOKEN_KEYWORD: return E.theme.COLOR_KEYWORD; case TOKEN_TYPE: return E.theme.COLOR_TYPE; case TOKEN_STRING: return E.theme.COLOR_STRING; case TOKEN_COMMENT: return E.theme.COLOR_COMMENT; case TOKEN_NUMBER: return E.theme.COLOR_NUMBER; default: return E.theme.COLOR_DEFAULT; } } int comment_section = 0; // Highlight a line of C code and return the highlighted string // Returns a newly allocated string that must be freed by the caller char *highlight_line(const char *line, int *length) { // Each byte can expand to at most (color_prefix + 4 bytes + color_reset). // Allocate generously based on line length to avoid overflow. 
int line_len = strlen(line); int buf_size = line_len * 32 + 256; char *result = malloc(buf_size); int result_pos = 0; int i = 0; while (line[i] != '\0' && line[i] != '\n') { // Skip whitespace — copy full UTF-8 char (whitespace is always ASCII, // but using copy_utf8_char keeps the pattern consistent) if (line[i] == ' ' || line[i] == '\t') { copy_utf8_char(result, &result_pos, line, &i); continue; } if (comment_section) { result_pos += sprintf(&result[result_pos], "%s", E.theme.COLOR_COMMENT); while (line[i] != '\0' && line[i] != '\n') { if (line[i] == '*' && line[i + 1] == '/') { comment_section = 0; } copy_utf8_char(result, &result_pos, line, &i); } result_pos += sprintf(&result[result_pos], "%s", COLOR_RESET); continue; } // Handle line comments if (line[i] == '/' && line[i + 1] == '/') { result_pos += sprintf(&result[result_pos], "%s", E.theme.COLOR_COMMENT); while (line[i] != '\0' && line[i] != '\n') { copy_utf8_char(result, &result_pos, line, &i); } result_pos += sprintf(&result[result_pos], "%s", COLOR_RESET); continue; } // Handle block comments if (line[i] == '/' && line[i + 1] == '*') { comment_section = 1; result_pos += sprintf(&result[result_pos], "%s", E.theme.COLOR_COMMENT); result[result_pos++] = line[i++]; result[result_pos++] = line[i++]; while (line[i] != '\0') { if (line[i] == '*' && line[i + 1] == '/') { result[result_pos++] = line[i++]; result[result_pos++] = line[i++]; comment_section = 0; break; } copy_utf8_char(result, &result_pos, line, &i); } result_pos += sprintf(&result[result_pos], "%s", COLOR_RESET); continue; } // Handle strings if (line[i] == '"') { result_pos += sprintf(&result[result_pos], "%s\"", E.theme.COLOR_STRING); i++; while (line[i] != '\0' && line[i] != '"') { if (line[i] == '\\') { result[result_pos++] = line[i++]; copy_utf8_char(result, &result_pos, line, &i); } else { copy_utf8_char(result, &result_pos, line, &i); } } if (line[i] == '"') result[result_pos++] = line[i++]; result_pos += sprintf(&result[result_pos], "%s", 
COLOR_RESET); continue; } // Handle character literals if (line[i] == '\'') { result_pos += sprintf(&result[result_pos], "%s'", E.theme.COLOR_STRING); i++; while (line[i] != '\0' && line[i] != '\'') { if (line[i] == '\\') { result[result_pos++] = line[i++]; copy_utf8_char(result, &result_pos, line, &i); } else { copy_utf8_char(result, &result_pos, line, &i); } } if (line[i] == '\'') result[result_pos++] = line[i++]; result_pos += sprintf(&result[result_pos], "%s", COLOR_RESET); continue; } // Handle numbers if (line[i] >= '0' && line[i] <= '9') { result_pos += sprintf(&result[result_pos], "%s", E.theme.COLOR_NUMBER); while (is_word_char(&line[i]) || line[i] == '.') { copy_utf8_char(result, &result_pos, line, &i); } result_pos += sprintf(&result[result_pos], "%s", COLOR_RESET); continue; } // Handle identifiers and keywords if (is_word_char(&line[i])) { int start = i; // Advance by full UTF-8 character widths, not single bytes while (line[i] != '\0' && is_word_char(&line[i])) i += utf8_char_len(&line[i]); char word[256]; int word_len = i - start; if (word_len >= (int)sizeof(word)) word_len = (int)sizeof(word) - 1; strncpy(word, &line[start], word_len); word[word_len] = '\0'; TokenType type = TOKEN_DEFAULT; if (is_keyword(word)) type = TOKEN_KEYWORD; else if (is_type(word)) type = TOKEN_TYPE; result_pos += sprintf(&result[result_pos], "%s%s%s", get_color(type), word, COLOR_RESET); continue; } // Handle operators and other characters (including non-ASCII multi-byte) result_pos += sprintf(&result[result_pos], "%s", E.theme.COLOR_DEFAULT); copy_utf8_char(result, &result_pos, line, &i); result_pos += sprintf(&result[result_pos], "%s", COLOR_RESET); } result[result_pos] = '\0'; *length = result_pos + 1; return result; }