Home

Go Back

JS-to-HTML Syntax highlighter

I stumbled upon sugar-high, a lightweight JSX syntax highlighter project, on GitHub. Since I have time on my hands, I thought: why not go through the source code and try to implement a similar syntax highlighter myself? I wrote the tokenizer and implemented a simple JSHighlighter component. The final outcome is far from perfect, and I can already imagine how much work it would be to implement a full-fledged one like those found in code editors.

Result 1

This is an example of the syntax highlighting on the tokenizer code itself.

import {
  ValidIdentifierContinue,
  ValidIdentifierStart,
  Keywords,
  T_IDENTIFIER,
  Whitespace,
  T_WHITESPACE,
  T_LINEBREAK,
  T_SINGLE_LINE_COMMENT,
  T_MULTI_LINE_COMMENT,
  Puncuators,
  T_PUNCTUATOR,
  T_KEYWORD,
  T_STRING,
  T_NUMERIC,
} from "./tokenizer.constants";

/**
 * Reports whether a character may appear after the first character of an
 * identifier.
 *
 * @param {string|number|undefined} codePoint - A one-character string (as
 *   produced by indexing into the source text) or a numeric code point.
 *   `undefined` (reading past the end of the input) is accepted and rejected.
 * @returns {boolean} `true` if the character is listed in
 *   `ValidIdentifierContinue` or is ZWNJ (U+200C) / ZWJ (U+200D).
 */
function isIdentifierContinue(codePoint) {
  if (ValidIdentifierContinue.has(codePoint)) {
    return true;
  }

  // Callers pass characters, so the numeric comparisons below must be done on
  // the character's code point, not on the string itself (a string compared to
  // a number is coerced with Number() and would never match).
  const value =
    typeof codePoint === "string" ? codePoint.codePointAt(0) : codePoint;
  if (typeof value !== "number" || Number.isNaN(value)) {
    return false;
  }

  // All ASCII identifier code points are listed in the set checked above
  if (value < 0x7f) {
    return false;
  }

  // ZWNJ and ZWJ are allowed in identifiers
  return value === 0x200c || value === 0x200d;
}

/**
 * Splits JavaScript source text into a flat list of tokens for syntax
 * highlighting.
 *
 * Concatenating the `value` of every returned token reproduces the input,
 * except for characters that match none of the recognised categories, which
 * are skipped.
 *
 * Unterminated strings, comments and template literals are not errors: the
 * token simply extends to the end of the input.
 *
 * @param {string} code - Source text to tokenize.
 * @returns {Array<{type: *, value: string}>} Tokens in source order; `type` is
 *   one of the `T_*` constants.
 */
export function tokenize(code) {
  const tokens = [];
  let i = 0;

  // Number of template literals currently open (nested ones included).
  let openTemplateCount = 0;
  // One entry per open `${ ... }` expression, holding the number of braces
  // opened and not yet closed inside it. The `{` of `${` itself is counted
  // when the main loop reaches it, so the expression ends when the count
  // returns to zero. Tracking depth keeps a `}` that closes an object literal
  // or block inside the expression from ending the expression early.
  const exprBraceDepths = [];

  // True while positioned in the literal text of a template (outside `${}`).
  const inStrTemplateLiterals = () =>
    openTemplateCount > exprBraceDepths.length;
  // True while positioned inside the `${ ... }` of the innermost template.
  const inStrTemplateExpr = () =>
    openTemplateCount > 0 && openTemplateCount === exprBraceDepths.length;

  const numericContinue = /[0-9a-zA-Z_]/;
  const isDigit = (c) => c !== undefined && c >= "0" && c <= "9";

  // Scans template literal text beginning at `i`, looking for the end of the
  // chunk from `scanFrom` onward. The chunk ends just before a `${`, just
  // after a closing backtick, or at the end of the input.
  const scanTemplateString = (scanFrom) => {
    const start = i;
    let end = scanFrom;
    while (end < code.length) {
      const c = code[end];
      if (c === "\\") {
        // Skip the escaped character so \` and \${ stay part of the text.
        end += 2;
        continue;
      }
      if (c === "$" && code[end + 1] === "{") {
        exprBraceDepths.push(0);
        break;
      }
      if (c === "`") {
        openTemplateCount--;
        end++;
        break;
      }
      end++;
    }
    end = Math.min(end, code.length);
    // The chunk is empty when one `${}` directly follows another.
    if (end > start) {
      tokens.push({
        type: T_STRING,
        value: code.slice(start, end),
      });
    }
    i = end;
  };

  while (i < code.length) {
    const char = code[i];

    if (inStrTemplateLiterals()) {
      // Resume the template text after a `}`; the scan starts at `i` itself so
      // that an immediately following `${` or closing backtick is recognised.
      scanTemplateString(i);
      continue;
    }

    if (char === "`") {
      openTemplateCount++;
      scanTemplateString(i + 1);
      continue;
    }

    if (Whitespace.has(char)) {
      let end = i + 1;
      while (end < code.length && Whitespace.has(code[end])) {
        end++;
      }
      tokens.push({
        type: T_WHITESPACE,
        value: code.slice(i, end),
      });
      i = end;
      continue;
    }

    if (char === "\n") {
      tokens.push({
        type: T_LINEBREAK,
        value: char,
      });
      i++;
      continue;
    }

    if (char === "/") {
      const nextChar = code[i + 1];
      if (nextChar === "/") {
        // The comment runs up to, but not including, the line break.
        const newlineAt = code.indexOf("\n", i);
        const end = newlineAt === -1 ? code.length : newlineAt;
        tokens.push({
          type: T_SINGLE_LINE_COMMENT,
          value: code.slice(i, end),
        });
        i = end;
        continue;
      }

      if (nextChar === "*") {
        // Search after the opening "/*" so that "/*/" is not mistaken for a
        // complete comment.
        const closeAt = code.indexOf("*/", i + 2);
        const end = closeAt === -1 ? code.length : closeAt + 2;
        tokens.push({
          type: T_MULTI_LINE_COMMENT,
          value: code.slice(i, end),
        });
        i = end;
        continue;
      }
    }

    if (char === "'" || char === '"') {
      let end = i + 1;
      while (end < code.length && code[end] !== char) {
        // A backslash escapes the next character, including the quote.
        end += code[end] === "\\" ? 2 : 1;
      }
      // Include the closing quote when there is one.
      end = Math.min(end + 1, code.length);
      tokens.push({
        type: T_STRING,
        value: code.slice(i, end),
      });
      i = end;
      continue;
    }

    if (Puncuators.has(char)) {
      if (inStrTemplateExpr()) {
        const top = exprBraceDepths.length - 1;
        if (char === "{") {
          exprBraceDepths[top]++;
        } else if (char === "}") {
          exprBraceDepths[top]--;
          if (exprBraceDepths[top] <= 0) {
            // This brace closes the `${ ... }`; template text follows.
            exprBraceDepths.pop();
          }
        }
      }

      tokens.push({
        type: T_PUNCTUATOR,
        value: char,
      });
      i++;
      continue;
    }

    if (ValidIdentifierStart.has(char)) {
      let end = i + 1;
      while (end < code.length && isIdentifierContinue(code[end])) {
        end++;
      }
      const identifier = code.slice(i, end);
      const type = Keywords.has(identifier) ? T_KEYWORD : T_IDENTIFIER;
      tokens.push({ type, value: identifier });
      i = end;
      continue;
    }

    if (isDigit(char)) {
      let end = i + 1;
      while (end < code.length) {
        const c = code[end];
        if (numericContinue.test(c)) {
          // Digits, letters (hex digits, exponents, prefixes/suffixes) and
          // numeric separators.
          end++;
        } else if (c === "." && isDigit(code[end + 1])) {
          // Decimal point followed by a fraction digit.
          end += 2;
        } else {
          break;
        }
      }
      tokens.push({
        type: T_NUMERIC,
        value: code.slice(i, end),
      });
      i = end;
      continue;
    }

    // Character not recognised by any rule above: skipped without a token.
    i++;
  }

  return tokens;
}

Result 2

Another example with string templates.

`Hello ${lol + 'banana'} 7 ${`bye bye` + "pineapple"}`;

Result 3

Last example with multiline comments.

/**
 * This is a multi-line JavaScript comment
 */