diff --git a/include/data.h b/include/data.h index 9379214..2e8dfb2 100644 --- a/include/data.h +++ b/include/data.h @@ -8,18 +8,16 @@ #include "lisp.h" /** - * \struct erow + * \struct row_t * \brief Store one editor row * \param * */ -typedef struct frow { +typedef struct row { int size; /**< Size of the line */ - int rsize; /**< Size of the render line */ + int cap; /**< Size of the render line */ char *chars; /**< Characters of the line */ - char *render; /**< The actual line we will print */ -} frow; - +} row_t; /** * @brief Split modes for screen layout @@ -41,7 +39,7 @@ typedef struct { int width; // Width of this pane int cursor_x; // Local cursor x in this pane int cursor_y; // Local cursor y in this pane - int rx, ry; + int rx, ry; int row_offset; // Scroll offset for rows int col_offset; // Scroll offset for columns int is_active; // Is this pane currently active @@ -113,11 +111,17 @@ struct buffer_t { * \brief Containing our editor state. */ struct editorConfig { + int cursor_x, cursor_y; /**< Cursor position */ + int rx; /**< Position in the render*/ + int row_offset; /**< Position scroll of lines */ + int col_offset; /**< Position scroll of colomns*/ int screenrows; /**< Terminal height*/ int screencols; /**< Terminal width*/ ScreenLayout layout; - + + int numrows; /**< Number of rows contained */ + row_t *rows; /**< Store all the rows printed */ int dirty; int prefix_state; char status_msg[80]; @@ -158,4 +162,5 @@ struct abuf { extern struct editorConfig E; + #endif diff --git a/include/define.h b/include/define.h index 3ff01e1..f3f0b18 100644 --- a/include/define.h +++ b/include/define.h @@ -8,19 +8,21 @@ #define HIDE_CURSOR "\x1b[?25l" #define SHOW_CURSOR "\x1b[?25h" #define ERASE_END_LINE "\x1b[K" +#define TAB "\x09" +#define SPACE "\x20" -enum editorKey { - BACKSPACE = 127, - ARROW_LEFT = 1000, - ARROW_RIGHT, - ARROW_UP, - ARROW_DOWN, - DEL_KEY, - BEG_LINE, - END_LINE, - PAGE_UP, - PAGE_DOWN, -}; +enum editorKey_e { + BACKSPACE = 127, + ARROW_LEFT = 1000, + ARROW_RIGHT, + ARROW_UP, + ARROW_DOWN, + DEL_KEY, + BEG_LINE, + END_LINE, + PAGE_UP, + PAGE_DOWN, + }; #define ABUF_INIT {NULL, 0} diff --git a/include/editor_op.h b/include/editor_op.h index db6863b..277a1be 100644 --- a/include/editor_op.h +++ b/include/editor_op.h @@ -6,8 +6,6 @@ void bufferInsertChar(int c); void bufferInsertNewLine(); -void bufferDelChar(); - void editorSetStatusMessage(const char *fmt, ...); #endif // EDITOR_OP_H_ diff --git a/include/row_op.h b/include/row_op.h index 859af50..57bd30d 100644 --- a/include/row_op.h +++ b/include/row_op.h @@ -8,18 +8,15 @@ #include #include -int bufferRowCxToRx(frow *row, int cursor_x); - -int bufferRowRxToCx(frow *row, int rx); - -void bufferUpdatfrow(frow *row); - void bufferInsertRow(struct buffer_t *buffer, int at, char *s, size_t len); +int editorRowCxToByte(const row_t *row, int cursor_x); -void bufferFrefrow(frow *row); +int editorRowCharCount(row_t *row); void bufferDelRow(struct buffer_t *buffer, int at); +void editorRowInsertBytes(row_t *row, int at, const char *src, int len); +void editorRowDelByte(row_t *row, int at, int n); void bufferRowInsertChar(struct buffer_t *buffer, frow *row, int at, int c); void bufferRowAppendString(struct buffer_t *buffer, frow *row, char *s, size_t len); diff --git a/include/terminal.h b/include/terminal.h index 6ab9108..416b7fa 100644 --- a/include/terminal.h +++ b/include/terminal.h @@ -31,4 +31,6 @@ int getCursorPosition(int *rows, int *cols); int getWindowSize(int *rows, int *cols); +char *key_to_string(int key); + #endif diff --git a/include/utf8.h b/include/utf8.h new file mode 100644 index 0000000..2c3f425 --- /dev/null +++ b/include/utf8.h @@ -0,0 +1,16 @@ +// +// Created by Giorgio on 01/05/2026. +// + +#ifndef BELUGA_UTF8_H +#define BELUGA_UTF8_H +#include + +uint32_t readUtf8Char(void); +int utf8Encode(uint32_t cp, char *buf); +int utf8Seqlen(unsigned char c); +int codepointWidth(uint32_t codepoint); +uint32_t utf8Decode(const char** s); + + +#endif //BELUGA_UTF8_H diff --git a/install.sh b/install.sh old mode 100755 new mode 100644 diff --git a/meson.build b/meson.build index cdc101d..9fec757 100644 --- a/meson.build +++ b/meson.build @@ -22,7 +22,8 @@ src_files = files( 'src/terminal.c', 'src/builtins.c', 'src/buffer.c', - 'src/split_screen.c' + 'src/split_screen.c', + 'src/utf8.c' ) # Executable diff --git a/src/append_buffer.c b/src/append_buffer.c index 00e14a3..38cdc05 100644 --- a/src/append_buffer.c +++ b/src/append_buffer.c @@ -1,7 +1,5 @@ #include "../include/append_buffer.h" -extern struct editorConfig E; - void abAppend(struct abuf *ab, const char *s, int len) { char *new = realloc(ab->b, ab->len + len); diff --git a/src/editor_op.c b/src/editor_op.c index 875e5b1..fd9c0b6 100644 --- a/src/editor_op.c +++ b/src/editor_op.c @@ -1,5 +1,3 @@ -#include - #include "../include/editor_op.h" #include "../include/row_op.h" #include "include/buffer.h" @@ -62,21 +60,3 @@ void bufferInsertNewLine() { fprintf(stderr, "Insert new line done\n"); } -void bufferDelChar() { - frow *row; - EditorPane *active = splitScreenGetActivePane(); - struct buffer_t *buf = bufferFindById(active->buffer_id); - if (active->cursor_y == buf->numrows || !(active->cursor_x || active->cursor_y)) { - return; - } - row = &buf->row[active->cursor_y]; - if (active->cursor_x > 0) { - bufferRowDelchar(buf, row, active->cursor_x - 1); - --active->cursor_x; - } else { - active->cursor_x = buf->row[active->cursor_y - 1].size; - bufferRowAppendString(buf, &buf->row[active->cursor_y - 1], row->chars, row->size); - bufferDelRow(buf, active->cursor_y); - --active->cursor_y; - } -} diff --git a/src/file_io.c b/src/file_io.c index cefdf97..1f72b54 100644 --- a/src/file_io.c +++ b/src/file_io.c @@ -9,53 +9,19 @@ #include "../include/file_io.h" #include "../include/editor_op.h" #include "../include/input.h" -#include "include/buffer.h" -#include "include/data.h" -#include "include/split_screen.h" +#include "../include/buffer.h" +#include "../include/data.h" +#include "../include/split_screen.h" #include #include #include #include #include #include +#include -extern char *strdup(const char *); -extern ssize_t getline(char **restrict lineptr, size_t *restrict n, - FILE *restrict stream); -extern int ftruncate(int fd, off_t length); extern struct editorConfig E; -/** - * @brief Converts all editor rows to a single string buffer - * @details Concatenates all row content into a single allocated buffer with - * newlines between rows. Useful for file saving and buffer operations. - * @param buffer_len Pointer to integer where total buffer length will be stored - * @return Pointer to dynamically allocated buffer containing all row data. - * Rows are separated by newline characters. - * @note Caller is responsible for freeing the returned buffer - */ -char *bufferRowsToString(struct buffer_t *buf, int *buffer_len) { - int tot_len = 0; - int j; - char *buffer; - char *p; - - for (j = 0; j < buf->numrows; ++j) { - tot_len += buf->row[j].size + 1; - } - *buffer_len = tot_len; - buffer = malloc(tot_len); - p = buffer; - for (j = 0; j < buf->numrows; ++j) { - memcpy(p, buf->row[j].chars, buf->row[j].size); - p += buf->row[j].size; - *p = '\n'; - p++; - } - - return buffer; -} - /** * @brief Closes the current file and resets editor state * @details Clears all rows, resets cursor position, scroll offsets, and file @@ -191,3 +157,4 @@ void bufferFind(struct buffer_t *buf) { } free(query); } + diff --git a/src/init.c b/src/init.c index eb33325..9278198 100644 --- a/src/init.c +++ b/src/init.c @@ -3,7 +3,7 @@ #include "../include/color.h" #include "../include/data.h" #include "../include/terminal.h" -#include "include/split_screen.h" +#include "../include/split_screen.h" #include #include #include @@ -17,6 +17,7 @@ struct editorConfig; void registerBuiltin(char *key_sequence, LispCFunc f) { lisp_env_define(E.ctx.p->env, lisp_make_symbol(key_sequence, E.ctx), lisp_make_func(f), E.ctx); + } void initBuiltins() { @@ -91,7 +92,7 @@ void initEditor() { } E.screenrows -= 2; - + // Init graphics variables splitScreenInit(); EditorPane *active = splitScreenGetActivePane(); diff --git a/src/input.c b/src/input.c index 69decfb..bc59a02 100644 --- a/src/input.c +++ b/src/input.c @@ -2,16 +2,19 @@ #include "../include/define.h" #include "../include/editor_op.h" #include "../include/output.h" +#include "include/data.h" #include "include/buffer.h" #include "include/data.h" #include "include/split_screen.h" #include +#include #include #include #include #include #include #include +#include extern struct editorConfig E; @@ -78,7 +81,7 @@ const char *file_completion(const char *path) { strcat(full_path, "/"); // add slash for directories } closedir(dir); - + return strdup(full_path); } } @@ -162,83 +165,6 @@ char *editorPrompt(char *prompt, char *placeHolder, char bPathMode) { } } -/** - * @brief Converts a key code to its string representation - * @details Translates raw key codes (including special keys, control keys, - * and regular characters) into human-readable string formats suitable for - * display and keybinding configuration. - * @param key The key code to convert - * @return Pointer to static buffer containing the string representation. - * Examples: "ENTER", "ARROW-UP", "CTRL-a", "TAB", "DELETE", etc. - * @note Returns pointer to static buffer; string is overwritten on next call - * @note Non-printable characters are formatted as "KEY-" - */ -char *key_to_string(int key) { - static char key_str[32]; - - char tmp[10]; - sprintf(tmp, "%d", key); - - // First test enter key - - if (key == '\r') { - strcpy(key_str, "ENTER"); - } else if (key == '\t') { - strcpy(key_str, "TAB"); - } else if (key >= 1 && key <= 26) { // CTRL keys - snprintf(key_str, sizeof(key_str), "CTRL-%c", 'a' + key - 1); - } else { - switch (key) { - case ARROW_UP: - strcpy(key_str, "ARROW-UP"); - break; - case ARROW_DOWN: - strcpy(key_str, "ARROW-DOWN"); - break; - case ARROW_LEFT: - strcpy(key_str, "ARROW-LEFT"); - break; - case ARROW_RIGHT: - strcpy(key_str, "ARROW-RIGHT"); - break; - case PAGE_UP: - strcpy(key_str, "PAGE-UP"); - fprintf(stderr, "pagr up\n"); - break; - case PAGE_DOWN: - strcpy(key_str, "PAGE-DOWN"); - break; - case DEL_KEY: - strcpy(key_str, "DEL"); - - break; - case BACKSPACE: - strcpy(key_str, "BACKSPACE"); - break; - case '\r': - strcpy(key_str, "ENTER"); - break; - case '\x1b': - strcpy(key_str, "ESCAPE"); - break; - case BEG_LINE: - strcpy(key_str, "HOME"); - break; - case END_LINE: - strcpy(key_str, "END"); - break; - default: - // For regular characters - if (isprint(key)) { - snprintf(key_str, sizeof(key_str), "%c", key); - } else { - snprintf(key_str, sizeof(key_str), "KEY-%d", key); - } - } - } - return key_str; -} - /** * @brief Moves the cursor based on arrow key input * @details Updates cursor position (E.cursor_x, E.cursor_y) based on the given @@ -285,8 +211,8 @@ int editorMoveCursor(int key) { } break; } - - + + return 1; } diff --git a/src/row_op.c b/src/row_op.c index ea22004..54cb0cb 100644 --- a/src/row_op.c +++ b/src/row_op.c @@ -1,152 +1,99 @@ #include "../include/row_op.h" +#include "../include/data.h" +#include "../include/define.h" #include #include #include #include +#include "include/utf8.h" + extern struct editorConfig E; -int bufferRowCxToRx(frow *row, int cursor_x) { - int render_x = 0; - int i; - for (i = 0; i < cursor_x; ++i) { - if (row->chars[i] == '\t') { - render_x += (E.constantes.TAB_LENGTH - 1) - (render_x % E.constantes.TAB_LENGTH); - } - render_x++; - } - return render_x; -} - -int bufferRowRxToCx(frow *row, int rx) { - int cur_rx = 0; - int cx; - for (cx = 0; cx < row->size; cx++) { - if (row->chars[cx] == '\t') - cur_rx += (E.constantes.TAB_LENGTH - 1) - (cur_rx % E.constantes.TAB_LENGTH); - cur_rx++; - if (cur_rx > rx) return cx; - } - return cx; -} - -/** - * \fn bufferUpdatfrow(frow *row) - * \brief Copy content of \p row in \p row->render. - * */ - -void bufferUpdatfrow(frow *row) { - int i, i_render; - int tabs = 0; - - // counting number of tabs - - for (i = 0; i < row->size; ++i) { - tabs += - (row->chars[i] == '\t'); /**< increment tabs of 1 if chars[i] is one. */ - } - - free(row->render); - row->render = malloc(row->size + tabs * (E.constantes.TAB_LENGTH - 1) + - 1); /**< Tabs needs E.constantes.TAB_LENGTH chars so E.constantes.TAB_LENGTH - 1 - more than the first already counted. */ - - // end of counting - i_render = 0; - for (i = 0; i < row->size; ++i) { - if (row->chars[i] == '\t') { - row->render[i_render++] = ' '; - while (i_render % E.constantes.TAB_LENGTH) { - row->render[i_render++] = - ' '; /**< Addind the right amount of spaces for tabs */ - } - } else { - row->render[i_render++] = row->chars[i]; - } - } - row->render[i_render] = '\0'; // Don't forget the end of string character. - row->rsize = i_render; -} void bufferInsertRow(struct buffer_t *buffer, int at, char *s, size_t len) { - + if (at < 0 || at > buffer->numrows) { return; } - frow *tmp = (frow *)realloc(buffer->row, sizeof(frow) * (buffer->numrows + 1)); + row_t *tmp = (row_t *)realloc(buffer->row, sizeof(row_t) * (buffer->numrows + 1)); if (!tmp) { return; } buffer->row = tmp; - memmove(&buffer->row[at + 1], &buffer->row[at], sizeof(frow) * (buffer->numrows - at)); + memmove(&buffer->row[at + 1], &buffer->row[at], sizeof(row_t) * (buffer->numrows - at)); buffer->row[at].size = len; + buffer->row[at].cap = len + 1; buffer->row[at].chars = malloc(len + 1); memcpy(buffer->row[at].chars, s, len); - buffer->row[at].chars[len] = '\0'; + buffer->row[at].chars[len] = '\n'; - buffer->row[at].rsize = 0; - buffer->row[at].render = NULL; - bufferUpdatfrow(&buffer->row[at]); ++buffer->numrows; ++buffer->dirty; } -void bufferFrefrow(frow *row) { - free(row->render); +void bufferFreeRow(row_t *row) { free(row->chars); } -void bufferDelRow(struct buffer_t *buffer, int at) { - if (at < 0 || at >= buffer->numrows) { - return; +int editorRowCxToByte(const row_t *row, int cursor_x) { + int i = 0, col = 0; + while (col < cursor_x && i < row->size) { + int sl = utf8Seqlen((unsigned char)row->chars[i]); + if (sl < 1) sl = 1; + col++; + i += sl; } - bufferFrefrow(&buffer->row[at]); - memmove(&buffer->row[at], &buffer->row[at + 1], sizeof(frow) * (buffer->numrows - at - 1)); - --buffer->numrows; - ++buffer->dirty; + return i; } /** - * \fn bufferRowInsertChar(frow *row, int at, int c) + * \fn editorRowInsertChar(erow *row, int at, int c) * \param at Index of where we want to insert the char */ -void bufferRowInsertChar(struct buffer_t *buffer, frow *row, int at, int c) { +void bufferRowInsertBytes(struct buffer_t *buffer, row_t *row, int at, char *src, int n) { if (buffer->state == READ_ONLY) return; - if (at < 0 || at > row->size) { - at = row->size; + if (row->size + n + 1 > row->cap) { + row->cap = (row->size + n + 1) * 2; + row->chars = realloc(row->chars, row->cap); } + memmove(row->chars + at + n, row->chars + at, row->size - at); + memcpy(row->chars + at, src, n); + row->size += n; row->chars = realloc(row->chars, row->size + 2); memmove(&row->chars[at + 1], &row->chars[at], row->size - at + 1); ++row->size; - row->chars[at] = c; - bufferUpdatfrow(row); ++buffer->dirty; } -void bufferRowAppendString(struct buffer_t *buffer, frow *row, char *s, size_t len) { - row->chars = realloc(row->chars, row->size + len + 1); - memcpy(&row->chars[row->size], s, len); - row->size += len; - row->chars[row->size] = '\0'; - bufferUpdatfrow(row); - ++buffer->dirty; -} /** * \fn bufferRowDelChar(struct bufferConfig *E, frow *frow, int at) * \brief Delete the a char at the chosen position on the given row * \param at Index of the char to delete * \param row Row on operation is made */ -void bufferRowDelchar(struct buffer_t *buffer, frow *row, int at) { +void bufferRowDelByte(struct buffer_t *buffer, row_t *row, int at, int n) +{ if (at < 0 || at >= row->size) { return; + memmove(row->chars + at, row->chars + at + n, row->size - at - n); + row->size -= n; + row->chars[row->size] = '\0'; } - memmove(&row->chars[at], &row->chars[at + 1], row->size - at); - --row->size; - bufferUpdatfrow(row); ++buffer->dirty; } +int editorRowCharCount(row_t *row) + { + int n = 0, i = 0; + while (i < row->size) { + int sl = utf8Seqlen((unsigned char)row->chars[i]); + if (sl < 1) sl = 1; + n++; i += sl; + } + return n; + } + diff --git a/src/terminal.c b/src/terminal.c index 521958a..15899dc 100644 --- a/src/terminal.c +++ b/src/terminal.c @@ -1,7 +1,15 @@ #include "../include/terminal.h" + +#include + #include "../include/data.h" +#include "../include/define.h" #include +#include +#include + +#include "include/utf8.h" void die(const char *s) { write(STDOUT_FILENO, "\x1b[2J", 4); @@ -35,73 +43,97 @@ void enableRawMode() { } } -int editorReadKey() { - int nread; - char c; - char seq[3]; - while ((nread = read(STDIN_FILENO, &c, 1)) != 1) { - if (nread == -1 && errno != EAGAIN) { - die("read"); - } - } +#include /* isprint */ - if (c == '\x1b') { - if (read(STDIN_FILENO, &seq[0], 1) != 1 || - read(STDIN_FILENO, &seq[1], 1) != 1) { - return '\x1b'; - } - if (seq[0] == '[') { - if (seq[1] >= '0' && seq[1] <= '9') { - if (read(STDIN_FILENO, &seq[2], 1) != 1) { - return '\x1b'; - } - if (seq[2] == '~') { - switch (seq[1]) { - case '1': - return BEG_LINE; - case '3': - return DEL_KEY; - case '4': - return END_LINE; - case '5': - return PAGE_UP; - case '6': - return PAGE_DOWN; - case '7': - return BEG_LINE; - case '8': - return END_LINE; - } - } - } else { +char *key_to_string(int key) { + static char key_str[32]; - switch (seq[1]) { - case 'A': - return ARROW_UP; - case 'B': - return ARROW_DOWN; - case 'C': - return ARROW_RIGHT; - case 'D': - return ARROW_LEFT; - case 'H': - return BEG_LINE; - case 'F': - return END_LINE; - } - } - } else if (seq[0] == 'O') { - switch (seq[1]) { - case 'H': - return BEG_LINE; - case 'F': - return END_LINE; - } - } - return '\x1b'; + if (key == '\r') { + strcpy(key_str, "ENTER"); + } else if (key >= 1 && key <= 26) { + snprintf(key_str, sizeof(key_str), "CTRL-%c", 'a' + key - 1); } else { - return c; + switch (key) { + case ARROW_UP: strcpy(key_str, "ARROW-UP"); break; + case ARROW_DOWN: strcpy(key_str, "ARROW-DOWN"); break; + case ARROW_LEFT: strcpy(key_str, "ARROW-LEFT"); break; + case ARROW_RIGHT: strcpy(key_str, "ARROW-RIGHT"); break; + case PAGE_UP: strcpy(key_str, "PAGE-UP"); break; + case PAGE_DOWN: strcpy(key_str, "PAGE-DOWN"); break; + case DEL_KEY: strcpy(key_str, "DEL"); break; + case BACKSPACE: strcpy(key_str, "BACKSPACE"); break; + case BEG_LINE: strcpy(key_str, "HOME"); break; + case END_LINE: strcpy(key_str, "END"); break; + case '\x1b': strcpy(key_str, "ESCAPE"); break; + default: + if (key > 127) { + /* UTF-8 code point — re-encode into the buffer */ + char buf[5] = {0}; + int n = utf8Encode((uint32_t)key, buf); + snprintf(key_str, sizeof(key_str), "%.*s", n, buf); + } else if (isprint(key)) { + snprintf(key_str, sizeof(key_str), "%c", key); + } else { + snprintf(key_str, sizeof(key_str), "KEY-%d", key); + } + } } + return key_str; +} + +int editorReadKey() { + char c; + /* read first byte — may be start of UTF-8 or escape */ + while (read(STDIN_FILENO, &c, 1) != 1); + + if (c == '\x1b') { + char seq[6]; + /* try to read escape sequence */ + if (read(STDIN_FILENO, &seq[0], 1) != 1) return '\x1b'; + if (read(STDIN_FILENO, &seq[1], 1) != 1) return '\x1b'; + if (seq[0] == '[') { + if (seq[1] >= '0' && seq[1] <= '9') { + if (read(STDIN_FILENO, &seq[2], 1) != 1) return '\x1b'; + if (seq[2] == '~') { + switch (seq[1]) { + case '1': return BEG_LINE; + case '3': return DEL_KEY; + case '4': return END_LINE; + case '5': return PAGE_UP; + case '6': return PAGE_DOWN; + case '7': return BEG_LINE; + case '8': return END_LINE; + } + } + } else { + switch (seq[1]) { + case 'A': return ARROW_UP; + case 'B': return ARROW_DOWN; + case 'C': return ARROW_RIGHT; + case 'D': return ARROW_LEFT; + case 'H': return BEG_LINE; + case 'F': return END_LINE; + } + } + } + return '\x1b'; + } + + /* multi-byte UTF-8: read remaining bytes */ + int seqlen = utf8Seqlen((unsigned char)c); + if (seqlen > 1) { + /* pack into a pseudo-codepoint just to pass bytes through; + we handle encoding/decoding at the row level */ + char buf[4] = {c, 0, 0, 0}; + for (int i = 1; i < seqlen; i++) + if (read(STDIN_FILENO, &buf[i], 1) != 1) break; + /* decode and return as uint32, but we need int — use high range */ + const char *p = buf; + uint32_t cp = utf8Decode(&p); + return (int)cp; /* caller re-encodes when inserting */ + } + + return (unsigned char)c; } int getCursorPosition(int *rows, int *cols) { diff --git a/src/utf8.c b/src/utf8.c new file mode 100644 index 0000000..10db8b3 --- /dev/null +++ b/src/utf8.c @@ -0,0 +1,148 @@ +/** + * @file utf8.c + */ + +#include "../include/utf8.h" +#include "../include/data.h" + +#include +#include + + +uint32_t readUtf8Char(void) +{ + unsigned char buf[4]; + + read(STDIN_FILENO, &buf[0], 1); + + int extra; + uint32_t cp; + + if (buf[0] < 0x80) + { + cp = buf[0]; + extra = 0; + } + else if (buf[0] < 0xC0) { return 0xFFFD; } // stray continuation + else if (buf[0] < 0xE0) + { + cp = buf[0] & 0x1F; + extra = 1; + } + else if (buf[0] < 0xF0) + { + cp = buf[0] & 0x0F; + extra = 2; + } + else + { + cp = buf[0] & 0x07; + extra = 3; + } + + if (extra > 0) + { + read(STDIN_FILENO, &buf[1], extra); // read remaining bytes at once + for (int i = 0; i < extra; i++) + cp = (cp << 6) | (buf[1 + i] & 0x3F); + } + + return cp; +} + +uint32_t utf8Decode(const char** s) +{ + unsigned char c = (unsigned char)**s; + uint32_t cp; + int extra; + if (c < 0x80) + { + cp = c; + extra = 0; + } + else if (c < 0xC0) + { + (*s)++; + return 0xFFFD; + } + else if (c < 0xE0) + { + cp = c & 0x1F; + extra = 1; + } + else if (c < 0xF0) + { + cp = c & 0x0F; + extra = 2; + } + else + { + cp = c & 0x07; + extra = 3; + } + (*s)++; + while (extra--) + { + c = (unsigned char)**s; + if ((c & 0xC0) != 0x80) return 0xFFFD; + cp = (cp << 6) | (c & 0x3F); + (*s)++; + } + return cp; +} + +// buf must have at least 4 bytes; returns bytes written +int utf8Encode(uint32_t cp, char* buf) +{ + if (cp < 0x80) + { + buf[0] = cp; + return 1; + } + if (cp < 0x800) + { + buf[0] = 0xC0 | (cp >> 6); + buf[1] = 0x80 | (cp & 0x3F); + return 2; + } + if (cp < 0x10000) + { + buf[0] = 0xE0 | (cp >> 12); + buf[1] = 0x80 | ((cp >> 6) & 0x3F); + buf[2] = 0x80 | (cp & 0x3F); + return 3; + } + buf[0] = 0xF0 | (cp >> 18); + buf[1] = 0x80 | ((cp >> 12) & 0x3F); + buf[2] = 0x80 | ((cp >> 6) & 0x3F); + buf[3] = 0x80 | (cp & 0x3F); + return 4; +} + +int utf8Seqlen(unsigned char c) +{ + if (c < 0x80) return 1; + if (c < 0xC0) return 0; /* continuation — shouldn't be leading */ + if (c < 0xE0) return 2; + if (c < 0xF0) return 3; + return 4; +} + +/** + * @param codepoint utf8 codepoint of a char + * @return length of the codepoint + */ +int codepointWidth(uint32_t codepoint) +{ + if (codepoint < 0x20 || codepoint == 0x7F) return 0; + /* rough double-width ranges */ + if ((codepoint >= 0x1100 && codepoint <= 0x115F) || + (codepoint >= 0x2E80 && codepoint <= 0x303E) || + (codepoint >= 0x3041 && codepoint <= 0x33BF) || + (codepoint >= 0xAC00 && codepoint <= 0xD7AF) || + (codepoint >= 0xF900 && codepoint <= 0xFAFF) || + (codepoint >= 0xFF01 && codepoint <= 0xFF60) || + (codepoint >= 0x1F300 && codepoint <= 0x1FAFF)) + return 2; + return 1; +}