temp commit

2026-05-22 13:40:03 +02:00
parent bc66e673fa
commit edb5384f0e
23 changed files with 848 additions and 621 deletions
@@ -1,5 +1,6 @@
 #include "../include/syntax_highlighter.h"
 #include "../include/data.h"
+#include "../include/utf8.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -18,10 +19,52 @@ const char *c_types[] = {"int",    "char", "float", "double",
                         "void",   "long", "short", "unsigned",
                         "signed", "bool", NULL};

+// Returns the byte length of the UTF-8 character starting at s:
+//   0    when s points at the NUL terminator,
+//   1    for ASCII, and for any invalid or truncated sequence,
+//   2-4  for a lead byte followed by the matching number of continuation bytes.
+// Never returns 0 for a non-NUL byte, so callers won't infinite-loop, and never
+// returns a length that reaches past the NUL terminator.
+static int utf8_char_len(const char *s)
+{
+    const unsigned char *p = (const unsigned char *)s;
+    int len;
+
+    if (p[0] == 0)                  return 0;
+    if (p[0] < 0x80)                return 1;
+    if ((p[0] & 0xE0) == 0xC0)      len = 2;
+    else if ((p[0] & 0xF0) == 0xE0) len = 3;
+    else if ((p[0] & 0xF8) == 0xF0) len = 4;
+    else                            return 1; // stray continuation or invalid lead byte
+
+    // Every trailing byte must look like 10xxxxxx. The NUL terminator does not,
+    // so the scan stops at the end of the string instead of reading beyond it.
+    for (int k = 1; k < len; k++) {
+        if ((p[k] & 0xC0) != 0x80)
+            return 1; // truncated or malformed: treat the lead byte as a lone byte
+    }
+    return len;
+}
+
+// Copy one UTF-8 character from src[*src_pos] into dst[*dst_pos] and advance
+// both indices by the number of bytes actually copied.
+//   dst, dst_pos: output buffer and its write index. No bounds check is done
+//                 here; the caller must leave room for up to 4 bytes.
+//   src, src_pos: NUL-terminated input and its read index.
+// Copies nothing when src[*src_pos] is the NUL terminator.
+static void copy_utf8_char(char *dst, int *dst_pos, const char *src, int *src_pos)
+{
+    int len = utf8_char_len(&src[*src_pos]);
+
+    // Stop early at a NUL: a lead byte at the very end of the line may announce
+    // more bytes than the string holds, and copying them would run off the end
+    // of src and carry *src_pos past the terminator.
+    for (int b = 0; b < len && src[*src_pos] != '\0'; b++) {
+        dst[(*dst_pos)++] = src[(*src_pos)++];
+    }
+}
+
 // Check if character is alphanumeric or underscore
-int is_word_char(char c) {
-  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
-         (c >= '0' && c <= '9') || c == '_' || c == '#';
+// Returns 1 when the UTF-8 character starting at s counts as a word character
+// for the highlighter, 0 otherwise. Accepted: ASCII letters and digits, '_',
+// '#' (presumably so preprocessor directives scan as one word), and the
+// Unicode ranges in the table below.
+// s must point into a readable, NUL-terminated byte sequence.
+int is_word_char(const char *s)
+{
+    // Inclusive code point ranges, kept in ascending order.
+    static const struct { uint32_t lo, hi; } word_ranges[] = {
+        // NOTE(review): the first range also contains U+00D7 (multiplication
+        // sign) and U+00F7 (division sign), which are operators, not letters.
+        {0x00C0, 0x017F}, // Latin-1 Supplement letters, Latin Extended-A
+        {0x0370, 0x03FF}, // Greek and Coptic
+        {0x0400, 0x04FF}, // Cyrillic
+        {0x05D0, 0x05EA}, // Hebrew letters
+        {0x0600, 0x06FF}, // Arabic, including the digits U+0660-0669 / U+06F0-06F9
+        {0x0900, 0x097F}, // Devanagari
+        {0x3040, 0x309F}, // Hiragana
+        {0x30A0, 0x30FF}, // Katakana
+        {0x4E00, 0x9FFF}, // CJK Unified Ideographs
+        {0xAC00, 0xD7A3}, // Hangul Syllables
+    };
+
+    // utf8Decode receives the address of the local copy of s, so the caller's
+    // pointer is never modified.
+    uint32_t cp = utf8Decode(&s);
+
+    if ((cp >= 'a' && cp <= 'z') || (cp >= 'A' && cp <= 'Z') ||
+        (cp >= '0' && cp <= '9') || cp == '_' || cp == '#')
+        return 1;
+
+    // U+FFFD is presumably what utf8Decode reports for malformed input
+    // (TODO confirm against utf8.h); it is never part of a word.
+    if (cp == 0xFFFD)
+        return 0;
+
+    for (size_t k = 0; k < sizeof word_ranges / sizeof word_ranges[0]; k++) {
+        if (cp >= word_ranges[k].lo && cp <= word_ranges[k].hi)
+            return 1;
+    }
+
+    return 0;
 }

 // Check if string is a keyword
@@ -65,26 +108,29 @@ int comment_section = 0;
 // Highlight a line of C code and return the highlighted string
 // Returns a newly allocated string that must be freed by the caller
 char *highlight_line(const char *line, int *length) {
-  char *result = malloc(1024); // Allocate space for result
+  // Each byte can expand to at most (color_prefix + 4 bytes + color_reset).
+  // Allocate generously based on line length to avoid overflow.
+  int line_len = strlen(line);
+  int buf_size = line_len * 32 + 256;
+  char *result = malloc(buf_size);
  int result_pos = 0;
  int i = 0;

  while (line[i] != '\0' && line[i] != '\n') {
-    // Skip whitespace
+    // Skip whitespace — copy full UTF-8 char (whitespace is always ASCII,
+    // but using copy_utf8_char keeps the pattern consistent)
    if (line[i] == ' ' || line[i] == '\t') {
-      result[result_pos++] = line[i++];
+      copy_utf8_char(result, &result_pos, line, &i);
      continue;
    }
-    
+
    if (comment_section) {
      result_pos += sprintf(&result[result_pos], "%s", E.theme.COLOR_COMMENT);
      while (line[i] != '\0' && line[i] != '\n') {
-        
-    if (line[i] == '*' && line[i + 1] == '/') {
-      comment_section = 0;
-    }
-
-        result[result_pos++] = line[i++];
+        if (line[i] == '*' && line[i + 1] == '/') {
+          comment_section = 0;
+        }
+        copy_utf8_char(result, &result_pos, line, &i);
      }
      result_pos += sprintf(&result[result_pos], "%s", COLOR_RESET);
      continue;
@@ -94,14 +140,14 @@ char *highlight_line(const char *line, int *length) {
    if (line[i] == '/' && line[i + 1] == '/') {
      result_pos += sprintf(&result[result_pos], "%s", E.theme.COLOR_COMMENT);
      while (line[i] != '\0' && line[i] != '\n') {
-        result[result_pos++] = line[i++];
+        copy_utf8_char(result, &result_pos, line, &i);
      }
      result_pos += sprintf(&result[result_pos], "%s", COLOR_RESET);
      continue;
    }

    // Handle block comments
-    if ((line[i] == '/' && line[i + 1] == '*')) {
+    if (line[i] == '/' && line[i + 1] == '*') {
      comment_section = 1;
      result_pos += sprintf(&result[result_pos], "%s", E.theme.COLOR_COMMENT);
      result[result_pos++] = line[i++];
@@ -110,9 +156,10 @@ char *highlight_line(const char *line, int *length) {
        if (line[i] == '*' && line[i + 1] == '/') {
          result[result_pos++] = line[i++];
          result[result_pos++] = line[i++];
+          comment_section = 0;
          break;
        }
-        result[result_pos++] = line[i++];
+        copy_utf8_char(result, &result_pos, line, &i);
      }
      result_pos += sprintf(&result[result_pos], "%s", COLOR_RESET);
      continue;
@@ -125,9 +172,9 @@ char *highlight_line(const char *line, int *length) {
      while (line[i] != '\0' && line[i] != '"') {
        if (line[i] == '\\') {
          result[result_pos++] = line[i++];
-          result[result_pos++] = line[i++];
+          copy_utf8_char(result, &result_pos, line, &i);
        } else {
-          result[result_pos++] = line[i++];
+          copy_utf8_char(result, &result_pos, line, &i);
        }
      }
      if (line[i] == '"')
@@ -143,9 +190,9 @@ char *highlight_line(const char *line, int *length) {
      while (line[i] != '\0' && line[i] != '\'') {
        if (line[i] == '\\') {
          result[result_pos++] = line[i++];
-          result[result_pos++] = line[i++];
+          copy_utf8_char(result, &result_pos, line, &i);
        } else {
-          result[result_pos++] = line[i++];
+          copy_utf8_char(result, &result_pos, line, &i);
        }
      }
      if (line[i] == '\'')
@@ -157,22 +204,26 @@ char *highlight_line(const char *line, int *length) {
    // Handle numbers
    if (line[i] >= '0' && line[i] <= '9') {
      result_pos += sprintf(&result[result_pos], "%s", E.theme.COLOR_NUMBER);
-      while (is_word_char(line[i]) || line[i] == '.') {
-        result[result_pos++] = line[i++];
+      while (is_word_char(&line[i]) || line[i] == '.') {
+        copy_utf8_char(result, &result_pos, line, &i);
      }
      result_pos += sprintf(&result[result_pos], "%s", COLOR_RESET);
      continue;
    }

    // Handle identifiers and keywords
-    if (is_word_char(line[i])) {
+    if (is_word_char(&line[i])) {
      int start = i;
-      while (is_word_char(line[i]))
-        i++;
+      // Advance by full UTF-8 character widths, not single bytes
+      while (line[i] != '\0' && is_word_char(&line[i]))
+        i += utf8_char_len(&line[i]);

      char word[256];
-      strncpy(word, &line[start], i - start);
-      word[i - start] = '\0';
+      int word_len = i - start;
+      if (word_len >= (int)sizeof(word))
+        word_len = (int)sizeof(word) - 1;
+      strncpy(word, &line[start], word_len);
+      word[word_len] = '\0';

      TokenType type = TOKEN_DEFAULT;
      if (is_keyword(word))
@@ -185,9 +236,9 @@ char *highlight_line(const char *line, int *length) {
      continue;
    }

-    // Handle operators and other characters
+    // Handle operators and other characters (including non-ASCII multi-byte)
    result_pos += sprintf(&result[result_pos], "%s", E.theme.COLOR_DEFAULT);
-    result[result_pos++] = line[i++];
+    copy_utf8_char(result, &result_pos, line, &i);
    result_pos += sprintf(&result[result_pos], "%s", COLOR_RESET);
  }