jstok - JavaScript and TypeScript source code tokenizer

Documentation Index

Lets you iterate over tokens (code units) in JavaScript or TypeScript code.

Example

// To download and run this example:
// curl 'https://raw.githubusercontent.com/jeremiah-shaulov/jstok/v2.0.1/README.md' | perl -ne '$y=$1 if /^```(.)?/;  print $_ if $y&&$m;  $m=$y&&($m||m~<example-p9mn>~)' > /tmp/example-p9mn.ts
// deno run /tmp/example-p9mn.ts

import {jstok, TokenType} from 'https://deno.land/x/jstok@v2.0.1/mod.ts';
import {assertEquals} from 'jsr:@std/assert@1.0.7/equals';

// Sample input for the tokenizer: a comment line followed by a statement.
// Both lines start with a tab. The tabs are written as `\t` escapes so that
// they stay tabs even if an editor or renderer converts literal leading tabs
// to spaces; the expected tokens in the assertion that follows require the
// whitespace after the comment to be exactly "\n\t".
const source =
`\t// Comment
\tconsole.log(\`Current time: \${new Date}\`);
`;

// Tokenize `source` and compare the result with the expected token list.
// Each Token is copied into a plain object with Object.assign() before the
// comparison (presumably so that class instances can be matched against the
// object literals below).
// Columns are 1-based and tabs are expanded with the default tabWidth of 4,
// which is why the token following a leading tab starts at nColumn 5.
assertEquals
(	[...jstok(source)].map(v => Object.assign<Record<never, never>, unknown>({}, v)),
    [	{nLine: 1, nColumn: 1, level: 0, type: TokenType.WHITESPACE, text: "\t"},
        {nLine: 1, nColumn: 5, level: 0, type: TokenType.COMMENT, text: "// Comment"},
        {nLine: 1, nColumn: 15, level: 0, type: TokenType.WHITESPACE, text: "\n\t"},
        {nLine: 2, nColumn: 5, level: 0, type: TokenType.IDENT, text: "console"},
        {nLine: 2, nColumn: 12, level: 0, type: TokenType.OTHER, text: "."},
        {nLine: 2, nColumn: 13, level: 0, type: TokenType.IDENT, text: "log"},
        {nLine: 2, nColumn: 16, level: 0, type: TokenType.OTHER, text: "("},
        // `level` grows by one inside "(" and by one more inside "${"
        {nLine: 2, nColumn: 17, level: 1, type: TokenType.STRING_TEMPLATE_BEGIN, text: "`Current time: ${"},
        {nLine: 2, nColumn: 34, level: 2, type: TokenType.IDENT, text: "new"},
        {nLine: 2, nColumn: 37, level: 2, type: TokenType.WHITESPACE, text: " "},
        {nLine: 2, nColumn: 38, level: 2, type: TokenType.IDENT, text: "Date"},
        {nLine: 2, nColumn: 42, level: 1, type: TokenType.STRING_TEMPLATE_END, text: "}`"},
        {nLine: 2, nColumn: 44, level: 0, type: TokenType.OTHER, text: ")"},
        {nLine: 2, nColumn: 45, level: 0, type: TokenType.OTHER, text: ";"},
        // The MORE_REQUEST meta-token is generated right before the last token
        // and carries the same text and position as that token
        {nLine: 2, nColumn: 46, level: 0, type: TokenType.MORE_REQUEST, text: "\n"},
        {nLine: 2, nColumn: 46, level: 0, type: TokenType.WHITESPACE, text: "\n"},
    ]
);

// Print every token of `source`, skipping the MORE_REQUEST meta-token.
for (const token of jstok(source))
{	if (token.type == TokenType.MORE_REQUEST)
    {	continue;
    }
    console.log(token);
}

jstok() - Tokenize string

function jstok(source: string, tabWidth: number=4, nLine: number=1, nColumn: number=1): Generator<Token, void, string>

This function returns an iterator over the JavaScript or TypeScript tokens found in source code provided as a string.

It will start counting lines and chars from the provided nLine and nColumn values. When counting chars, it will respect the desired tabWidth.

Before returning the last token in the source, it generates TokenType.MORE_REQUEST. You can ignore it, or you can react by calling it.next(more) on the iterator with a string argument that contains the continuation of the code. This string will be concatenated with the contents of the TokenType.MORE_REQUEST token, and the tokenization process will continue.

// To download and run this example:
// curl 'https://raw.githubusercontent.com/jeremiah-shaulov/jstok/v2.0.1/README.md' | perl -ne '$y=$1 if /^```(.)?/;  print $_ if $y&&$m;  $m=$y&&($m||m~<example-65ya>~)' > /tmp/example-65ya.ts
// deno run /tmp/example-65ya.ts

import {jstok, TokenType} from 'https://deno.land/x/jstok@v2.0.1/mod.ts';

// Source code that will be fed to the tokenizer piece by piece.
// Both lines start with a tab, written as a `\t` escape so that it stays a tab
// even if an editor or renderer converts literal leading tabs to spaces.
let source =
`\t// Comment
\tconsole.log(\`Current time: \${new Date}\`);
`;

/**
 * Cuts the next chunk of up to 10 characters off the beginning of `source`
 * and returns it. Returns an empty string once `source` is exhausted.
 */
function getNextPart(): string
{	const part = source.slice(0, 10);
    source = source.slice(10);
    return part;
}

// Start tokenizing with the first chunk only; further chunks are supplied on demand.
const it = jstok(getNextPart());
for (let token = it.next().value; token; token = it.next().value)
{	// On each MORE_REQUEST, hand the next chunk to the tokenizer
    // and look at what it yields in response.
    while (token && token.type == TokenType.MORE_REQUEST)
    {	token = it.next(getNextPart()).value;
    }
    if (!token)
    {	// The iterator finished while more input was being supplied
        break;
    }

    console.log(token);
}

This library cannot be used to check source code syntax. However, in 2 cases it returns TokenType.ERROR:

  1. if an invalid character occurred
  2. if an unbalanced bracket occurred

Token

class Token
{
    🔧 constructor(text: string, type: TokenType, nLine: number=1, nColumn: number=1, level: number=0)
    📄 text: string
    📄 type: TokenType
    📄 nLine: number
    📄 nColumn: number
    📄 level: number
    ⚙ toString(): string
    ⚙ debug(): string
    ⚙ getValue(): string
    ⚙ getNumberValue(): number | bigint
    ⚙ getRegExpValue(): RegExp
}

  • text - original JavaScript token text.
  • type - Token type.
  • nLine - Line number where this token starts.
  • nColumn - Column number on the line where this token starts.
  • level - Nesting level. Entering (, [ and { increments the level counter. Also the level is incremented when entering ${ parameters in string templates.

toString() method returns original JavaScript token (this.text), except for TokenType.MORE_REQUEST, for which it returns empty string.

getValue() method converts the JavaScript token to its JavaScript value, if the value is a string.

getNumberValue() method returns Number or BigInt value of the token for TokenType.NUMBER tokens. For others returns NaN.

getRegExpValue() method returns RegExp object. For TokenType.REGEXP tokens it’s the regular expression that this token represents. For other token types this method returns just a default empty RegExp object.

debug() method returns string with console.log()-ready representation of this Token object for debug purposes.

// To download and run this example:
// curl 'https://raw.githubusercontent.com/jeremiah-shaulov/jstok/v2.0.1/README.md' | perl -ne '$y=$1 if /^```(.)?/;  print $_ if $y&&$m;  $m=$y&&($m||m~<example-pf4z>~)' > /tmp/example-pf4z.ts
// deno run --allow-read /tmp/example-pf4z.ts

import {jstok} from 'https://deno.land/x/jstok@v2.0.1/mod.ts';

// Tokenize this very file and print a debug dump of every token.
// The module URL is passed to Deno.readTextFile() as a URL object rather than
// as `url.pathname`: the pathname is percent-encoded (e.g. spaces become %20)
// and on Windows has a slash before the drive letter, so it is not always a
// usable filesystem path.
const code = await Deno.readTextFile(new URL(import.meta.url));
const tokens = [...jstok(code)];
console.log(tokens.map(t => t.debug()).join(',\n') + ',');

TokenType

const enum TokenType
{
    WHITESPACE = 0
    COMMENT = 1
    ATTRIBUTE = 2
    IDENT = 3
    NUMBER = 4
    STRING = 5
    STRING_TEMPLATE = 6
    STRING_TEMPLATE_BEGIN = 7
    STRING_TEMPLATE_MID = 8
    STRING_TEMPLATE_END = 9
    REGEXP = 10
    OTHER = 11
    MORE_REQUEST = 12
    ERROR = 13
}

  • WHITESPACE - Any number of any whitespace characters. Multiple such token types are not generated in sequence.
  • COMMENT - One single-line or multiline comment, or hashbang.
  • ATTRIBUTE - Like @Component.
  • IDENT - Can contain unicode letters. Private property names like #flags are also considered IDENTs.
  • NUMBER - Number.
  • STRING - String.
  • STRING_TEMPLATE - Whole backtick-string, if it has no parameters.
  • STRING_TEMPLATE_BEGIN - First part of a backtick-string, till its first parameter. The contents of parameters will be tokenized separately, and returned as corresponding token types.
  • STRING_TEMPLATE_MID - Part of backtick-string between two parameters.
  • STRING_TEMPLATE_END - Last part of backtick-string.
  • REGEXP - Regular expression literal.
  • OTHER - Other tokens, like +, ++, ?., etc.
  • MORE_REQUEST - Before returning the last token found in the source string, jstok() generates this meta-token. If you then call it.next(more) with a nonempty string argument, this string will be appended to the last token, and the tokenization will continue.
  • ERROR - This token type is returned in 2 situations: 1) an invalid character occurred; 2) an unbalanced bracket occurred.

jstokStream() - Tokenize ReadableStream

This function allows to tokenize a ReadableStream<Uint8Array> of JavaScript or TypeScript source code. It never generates TokenType.MORE_REQUEST.

function jstokStream(source: ReadableStream<Uint8Array>, tabWidth: number=4, nLine: number=1, nColumn: number=1, decoder: TextDecoder=defaultDecoder): AsyncGenerator<Token, void, any>

It will start counting lines and chars from the provided nLine and nColumn values. When counting chars, it will respect the desired tabWidth.

If decoder is provided, it will be used to convert bytes to text.

// To download and run this example:
// curl 'https://raw.githubusercontent.com/jeremiah-shaulov/jstok/v2.0.1/README.md' | perl -ne '$y=$1 if /^```(.)?/;  print $_ if $y&&$m;  $m=$y&&($m||m~<example-ksv8>~)' > /tmp/example-ksv8.ts
// deno run --allow-read /tmp/example-ksv8.ts

import {jstokStream} from 'https://deno.land/x/jstok@v2.0.1/mod.ts';

// Open this very file and tokenize it as a stream, printing each token.
// The module URL is passed to Deno.open() as a URL object rather than as
// `url.pathname`: the pathname is percent-encoded (e.g. spaces become %20)
// and on Windows has a slash before the drive letter, so it is not always a
// usable filesystem path.
const fh = await Deno.open(new URL(import.meta.url), {read: true});
for await (const token of jstokStream(fh.readable))
{	console.log(token);
}