{
  "_id": "6a1ee7afb401979e7341189b",
  "Package": "tok",
  "Title": "Fast Text Tokenization",
  "Version": "0.2.2.9000",
  "Authors@R": "c(\nperson(\"Tomasz\", \"Kalinowski\", , \"tomasz@posit.co\", c(\"ctb\", \"cre\")),\nperson(\"Daniel\", \"Falbel\", , \"dfalbel@gmail.com\", c(\"aut\")),\nperson(\"Regouby\", \"Christophe\", , \"christophe.regouby@free.fr\", c(\"ctb\")),\nperson(family = \"Posit\", role = c(\"cph\"))\n)",
  "Description": "Interfaces with the 'Hugging Face' tokenizers library to\nprovide implementations of today's most used tokenizers such as\nthe 'Byte-Pair Encoding' algorithm\n<https://huggingface.co/docs/tokenizers/index>. It's extremely\nfast for both training new vocabularies and tokenizing texts.",
  "License": "MIT + file LICENSE",
  "SystemRequirements": "Cargo (Rust's package manager), rustc >= 1.77.2",
  "Encoding": "UTF-8",
  "Roxygen": "list(markdown = TRUE)",
  "Config/testthat/edition": "3",
  "URL": "https://github.com/mlverse/tok",
  "BugReports": "https://github.com/mlverse/tok/issues",
  "Config/rextendr/version": "0.5.0",
  "Config/roxygen2/version": "8.0.0",
  "Config/pak/sysreqs": "libclang-dev",
  "Repository": "https://mlverse.r-universe.dev",
  "Date/Publication": "2026-05-19 13:09:10 UTC",
  "RemoteUrl": "https://github.com/mlverse/tok",
  "RemoteRef": "HEAD",
  "RemoteSha": "f925ad65e356dc6d295b633391aa00eae9dad8fe",
  "NeedsCompilation": "yes",
  "Packaged": {
    "Date": "2026-05-19 14:20:54 UTC",
    "User": "root"
  },
  "Author": "Tomasz Kalinowski [ctb, cre],\nDaniel Falbel [aut],\nRegouby Christophe [ctb],\nPosit [cph]",
  "Maintainer": "Tomasz Kalinowski <tomasz@posit.co>",
  "MD5sum": "701d7943f4a99a03e7003a0f0fa7058b",
  "_user": "mlverse",
  "_type": "src",
  "_file": "tok_0.2.2.9000.tar.gz",
  "_fileid": "aebfa89d1ded935dfc02f27d186ff052f86fea75c0eea46c524d7b767198a563",
  "_filesize": 11765020,
  "_sha256": "aebfa89d1ded935dfc02f27d186ff052f86fea75c0eea46c524d7b767198a563",
  "_created": "2026-05-19T14:20:54.000Z",
  "_published": "2026-06-02T14:24:46.996Z",
  "_distro": "noble",
  "_jobs": [
    {
      "job": 79093344918,
      "time": 267,
      "config": "linux-devel-arm64",
      "r": "4.7.0",
      "check": "OK",
      "artifact": "7086308010"
    },
    {
      "job": 79093345056,
      "time": 253,
      "config": "linux-devel-x86_64",
      "r": "4.7.0",
      "check": "OK",
      "artifact": "7086301727"
    },
    {
      "job": 79093345642,
      "time": 256,
      "config": "linux-release-arm64",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "7086303614"
    },
    {
      "job": 79093345741,
      "time": 290,
      "config": "linux-release-x86_64",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "7086317990"
    },
    {
      "job": 79093346217,
      "time": 176,
      "config": "macos-oldrel-arm64",
      "r": "4.5.3",
      "check": "OK",
      "artifact": "7086268515"
    },
    {
      "job": 79093346268,
      "time": 413,
      "config": "macos-oldrel-x86_64",
      "r": "4.5.3",
      "check": "OK",
      "artifact": "7086370619"
    },
    {
      "job": 79093345059,
      "time": 209,
      "config": "macos-release-arm64",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "7086282489"
    },
    {
      "job": 79093345165,
      "time": 411,
      "config": "macos-release-x86_64",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "7086370268"
    },
    {
      "job": 79093343582,
      "time": 339,
      "config": "source",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "7086190304"
    },
    {
      "job": 79093344248,
      "time": 226,
      "config": "wasm-release",
      "r": "4.6.0",
      "check": "FAIL",
      "artifact": ""
    },
    {
      "job": 79093345135,
      "time": 365,
      "config": "windows-devel",
      "r": "4.7.0",
      "check": "OK",
      "artifact": "7086350621"
    },
    {
      "job": 79093345204,
      "time": 295,
      "config": "windows-oldrel",
      "r": "4.5.3",
      "check": "OK",
      "artifact": "7086320310"
    },
    {
      "job": 79093345775,
      "time": 294,
      "config": "windows-release",
      "r": "4.6.0",
      "check": "OK",
      "artifact": "7086319856"
    }
  ],
  "_buildurl": "https://github.com/r-universe/mlverse/actions/runs/26103110414",
  "_status": "success",
  "_host": "GitHub-Actions",
  "_upstream": "https://github.com/mlverse/tok",
  "_commit": {
    "id": "f925ad65e356dc6d295b633391aa00eae9dad8fe",
    "author": "Tomasz Kalinowski <kalinowskit@gmail.com>",
    "committer": "Tomasz Kalinowski <kalinowskit@gmail.com>",
    "message": "update `cran-comments.md`\n",
    "time": 1779196150
  },
  "_maintainer": {
    "name": "Tomasz Kalinowski",
    "email": "tomasz@posit.co",
    "login": "t-kalinowski",
    "mastodon": "@t_kalinowski@fosstodon.org",
    "bluesky": "@t-kalinowski.bsky.social",
    "description": "",
    "uuid": 8462255
  },
  "_registered": true,
  "_dependencies": [
    {
      "package": "R",
      "version": ">= 4.2.0",
      "role": "Depends"
    },
    {
      "package": "R6",
      "role": "Imports"
    },
    {
      "package": "cli",
      "role": "Imports"
    },
    {
      "package": "rmarkdown",
      "role": "Suggests"
    },
    {
      "package": "testthat",
      "version": ">= 3.0.0",
      "role": "Suggests"
    },
    {
      "package": "hfhub",
      "version": ">= 0.1.1",
      "role": "Suggests"
    },
    {
      "package": "withr",
      "role": "Suggests"
    }
  ],
  "_owner": "mlverse",
  "_selfowned": true,
  "_usedby": 1,
  "_updates": [
    {
      "week": "2025-35",
      "n": 7
    },
    {
      "week": "2025-40",
      "n": 6
    },
    {
      "week": "2026-16",
      "n": 3
    },
    {
      "week": "2026-17",
      "n": 2
    },
    {
      "week": "2026-21",
      "n": 2
    }
  ],
  "_tags": [
    {
      "name": "v0.2.0",
      "date": "2025-08-26"
    },
    {
      "name": "v0.2.1",
      "date": "2025-09-30"
    },
    {
      "name": "v0.2.2",
      "date": "2026-04-21"
    }
  ],
  "_stars": 47,
  "_contributors": [
    {
      "user": "dfalbel",
      "count": 107,
      "uuid": 4706822
    },
    {
      "user": "t-kalinowski",
      "count": 8,
      "uuid": 8462255
    },
    {
      "user": "cregouby",
      "count": 1,
      "uuid": 10136115
    }
  ],
  "_userbio": {
    "uuid": 55406849,
    "type": "organization",
    "name": "mlverse",
    "description": "Open source libraries to scale Data Science"
  },
  "_downloads": {
    "count": 11011,
    "source": "https://cranlogs.r-pkg.org/downloads/total/last-month/tok"
  },
  "_devurl": "https://github.com/mlverse/tok",
  "_searchresults": 15,
  "_cargo": true,
  "_topics": [
    "rust",
    "cargo"
  ],
  "_rbuild": "4.6.0",
  "_assets": [
    "extra/citation.cff",
    "extra/citation.html",
    "extra/citation.json",
    "extra/citation.txt",
    "extra/contents.json",
    "extra/NEWS.html",
    "extra/NEWS.txt",
    "extra/readme.html",
    "extra/readme.md",
    "extra/tok.html",
    "manual.pdf"
  ],
  "_homeurl": "https://github.com/mlverse/tok",
  "_realowner": "mlverse",
  "_cranurl": true,
  "_releases": [
    {
      "version": "0.1.0",
      "date": "2023-07-06"
    },
    {
      "version": "0.1.1",
      "date": "2023-08-18"
    },
    {
      "version": "0.1.2",
      "date": "2024-06-27"
    },
    {
      "version": "0.1.3",
      "date": "2024-07-06"
    },
    {
      "version": "0.1.4",
      "date": "2024-09-04"
    },
    {
      "version": "0.2.0",
      "date": "2025-08-27"
    },
    {
      "version": "0.2.1",
      "date": "2025-10-03"
    },
    {
      "version": "0.2.2",
      "date": "2026-04-22"
    }
  ],
  "_exports": [
    "decoder_byte_level",
    "encoding",
    "model_bpe",
    "model_unigram",
    "model_wordpiece",
    "normalizer_nfc",
    "normalizer_nfkc",
    "pre_tokenizer",
    "pre_tokenizer_byte_level",
    "pre_tokenizer_whitespace",
    "processor_byte_level",
    "tok_decoder",
    "tok_model",
    "tok_normalizer",
    "tok_processor",
    "tok_trainer",
    "tokenizer",
    "trainer_bpe",
    "trainer_unigram",
    "trainer_wordpiece"
  ],
  "_help": [
    {
      "page": "decoder_byte_level",
      "title": "Byte level decoder",
      "concept": [
        "decoders"
      ],
      "topics": [
        "decoder_byte_level"
      ]
    },
    {
      "page": "encoding",
      "title": "Encoding",
      "topics": [
        "encoding"
      ]
    },
    {
      "page": "model_bpe",
      "title": "BPE model",
      "concept": [
        "model"
      ],
      "topics": [
        "model_bpe"
      ]
    },
    {
      "page": "model_unigram",
      "title": "An implementation of the Unigram algorithm",
      "concept": [
        "model"
      ],
      "topics": [
        "model_unigram"
      ]
    },
    {
      "page": "model_wordpiece",
      "title": "An implementation of the WordPiece algorithm",
      "concept": [
        "model"
      ],
      "topics": [
        "model_wordpiece"
      ]
    },
    {
      "page": "normalizer_nfc",
      "title": "NFC normalizer",
      "concept": [
        "normalizers"
      ],
      "topics": [
        "normalizer_nfc"
      ]
    },
    {
      "page": "normalizer_nfkc",
      "title": "NFKC normalizer",
      "concept": [
        "normalizers"
      ],
      "topics": [
        "normalizer_nfkc"
      ]
    },
    {
      "page": "pre_tokenizer",
      "title": "Generic class for tokenizers",
      "concept": [
        "pre_tokenizer"
      ],
      "topics": [
        "pre_tokenizer"
      ]
    },
    {
      "page": "pre_tokenizer_byte_level",
      "title": "Byte level pre tokenizer",
      "concept": [
        "pre_tokenizer"
      ],
      "topics": [
        "pre_tokenizer_byte_level"
      ]
    },
    {
      "page": "pre_tokenizer_whitespace",
      "title": "This pre-tokenizer simply splits using the following regex: \\w+|[^\\w\\s]+",
      "concept": [
        "pre_tokenizer"
      ],
      "topics": [
        "pre_tokenizer_whitespace"
      ]
    },
    {
      "page": "processor_byte_level",
      "title": "Byte Level post processor",
      "concept": [
        "processors"
      ],
      "topics": [
        "processor_byte_level"
      ]
    },
    {
      "page": "tok_decoder",
      "title": "Generic class for decoders",
      "concept": [
        "decoders"
      ],
      "topics": [
        "tok_decoder"
      ]
    },
    {
      "page": "tok_model",
      "title": "Generic class for tokenization models",
      "concept": [
        "model"
      ],
      "topics": [
        "tok_model"
      ]
    },
    {
      "page": "tok_normalizer",
      "title": "Generic class for normalizers",
      "concept": [
        "normalizers"
      ],
      "topics": [
        "tok_normalizer"
      ]
    },
    {
      "page": "tok_processor",
      "title": "Generic class for processors",
      "concept": [
        "processors"
      ],
      "topics": [
        "tok_processor"
      ]
    },
    {
      "page": "tok_trainer",
      "title": "Generic training class",
      "concept": [
        "trainer"
      ],
      "topics": [
        "tok_trainer"
      ]
    },
    {
      "page": "tokenizer",
      "title": "Tokenizer",
      "topics": [
        "tokenizer"
      ]
    },
    {
      "page": "trainer_bpe",
      "title": "BPE trainer",
      "concept": [
        "trainer"
      ],
      "topics": [
        "trainer_bpe"
      ]
    },
    {
      "page": "trainer_unigram",
      "title": "Unigram tokenizer trainer",
      "concept": [
        "trainer"
      ],
      "topics": [
        "trainer_unigram"
      ]
    },
    {
      "page": "trainer_wordpiece",
      "title": "WordPiece tokenizer trainer",
      "concept": [
        "trainer"
      ],
      "topics": [
        "trainer_wordpiece"
      ]
    }
  ],
  "_readme": "https://github.com/mlverse/tok/raw/HEAD/README.md",
  "_rundeps": [
    "cli",
    "R6"
  ],
  "_score": 7.066107138684624,
  "_indexed": true,
  "_nocasepkg": "tok",
  "_universes": [
    "mlverse",
    "t-kalinowski"
  ],
  "_binaries": [
    {
      "r": "4.7.0",
      "os": "linux",
      "version": "0.2.2.9000",
      "date": "2026-05-19T14:25:25.000Z",
      "distro": "noble",
      "arch": "aarch64",
      "commit": "f925ad65e356dc6d295b633391aa00eae9dad8fe",
      "fileid": "6b75fe4031fd149febd8a2cebc262dd272ae5dce8fc68e07ca9cc81bce5c1f30",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/mlverse/actions/runs/26103110414"
    },
    {
      "r": "4.7.0",
      "os": "linux",
      "version": "0.2.2.9000",
      "date": "2026-05-19T14:25:13.000Z",
      "distro": "noble",
      "arch": "x86_64",
      "commit": "f925ad65e356dc6d295b633391aa00eae9dad8fe",
      "fileid": "7832aecc489a93a3f0c8b1bd6659e8ea587f3af3ff5305d0a38893e529192c8c",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/mlverse/actions/runs/26103110414"
    },
    {
      "r": "4.6.0",
      "os": "linux",
      "version": "0.2.2.9000",
      "date": "2026-05-19T14:25:15.000Z",
      "distro": "noble",
      "arch": "aarch64",
      "commit": "f925ad65e356dc6d295b633391aa00eae9dad8fe",
      "fileid": "2b39c0b804484acf35629f82a6e7e40c445ef369d793781d618b0b46a0a11c30",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/mlverse/actions/runs/26103110414"
    },
    {
      "r": "4.6.0",
      "os": "linux",
      "version": "0.2.2.9000",
      "date": "2026-05-19T14:25:51.000Z",
      "distro": "noble",
      "arch": "x86_64",
      "commit": "f925ad65e356dc6d295b633391aa00eae9dad8fe",
      "fileid": "709afc97136b67df530e0785fc61a0ec524fb504f3e81ee863d3c3b004c73d51",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/mlverse/actions/runs/26103110414"
    },
    {
      "r": "4.5.3",
      "os": "mac",
      "version": "0.2.2.9000",
      "date": "2026-05-19T14:23:52.000Z",
      "arch": "aarch64",
      "commit": "f925ad65e356dc6d295b633391aa00eae9dad8fe",
      "fileid": "3e07f49a28521e71ea1854d3f9e156277f7fe06588533797473c67d06058948a",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/mlverse/actions/runs/26103110414"
    },
    {
      "r": "4.5.3",
      "os": "mac",
      "version": "0.2.2.9000",
      "date": "2026-05-19T14:25:36.000Z",
      "arch": "x86_64",
      "commit": "f925ad65e356dc6d295b633391aa00eae9dad8fe",
      "fileid": "64e9bd1c13dc139c763517a7e4ddebbe1943d76333f7843b668b3c06a583b1d2",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/mlverse/actions/runs/26103110414"
    },
    {
      "r": "4.6.0",
      "os": "mac",
      "version": "0.2.2.9000",
      "date": "2026-05-19T14:24:14.000Z",
      "arch": "aarch64",
      "commit": "f925ad65e356dc6d295b633391aa00eae9dad8fe",
      "fileid": "409532887069ad406c1721b964dbed470c0527da5284eb19800ced8c2bfc04eb",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/mlverse/actions/runs/26103110414"
    },
    {
      "r": "4.6.0",
      "os": "mac",
      "version": "0.2.2.9000",
      "date": "2026-05-19T14:25:26.000Z",
      "arch": "x86_64",
      "commit": "f925ad65e356dc6d295b633391aa00eae9dad8fe",
      "fileid": "49bab11ac112383f1d196016a4632403878596541d6498bf0a007f02eb018175",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/mlverse/actions/runs/26103110414"
    },
    {
      "r": "4.7.0",
      "os": "win",
      "version": "0.2.2.9000",
      "date": "2026-05-19T14:24:34.000Z",
      "arch": "x86_64",
      "commit": "f925ad65e356dc6d295b633391aa00eae9dad8fe",
      "fileid": "dfa076045b936c6c290b52cdc557f89eac652bc4ab3bb42f8ba5a769c469274f",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/mlverse/actions/runs/26103110414"
    },
    {
      "r": "4.5.3",
      "os": "win",
      "version": "0.2.2.9000",
      "date": "2026-05-19T14:23:51.000Z",
      "arch": "x86_64",
      "commit": "f925ad65e356dc6d295b633391aa00eae9dad8fe",
      "fileid": "d9a710c9c1d38e027625e84618f66707a5888273325f5ee7a6a77987c22f559b",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/mlverse/actions/runs/26103110414"
    },
    {
      "r": "4.6.0",
      "os": "win",
      "version": "0.2.2.9000",
      "date": "2026-05-19T14:23:49.000Z",
      "arch": "x86_64",
      "commit": "f925ad65e356dc6d295b633391aa00eae9dad8fe",
      "fileid": "77b719a301c4edcda113f77bd347d316672bf62b4926f4341bb46e51c9a80f00",
      "status": "success",
      "check": "OK",
      "buildurl": "https://github.com/r-universe/mlverse/actions/runs/26103110414"
    }
  ]
}