Tokenizers

dnallm.models.tokenizer

Classes

DNAOneHotTokenizer

DNAOneHotTokenizer(
    max_length=196608,
    padding_side="right",
    return_embeds=False,
    embeds_transpose=False,
)
Source code in dnallm/models/tokenizer.py
def __init__(
    self,
    max_length: int | None = 196_608,
    padding_side: str = "right",
    return_embeds: bool = False,
    embeds_transpose: bool = False,
):
    self.max_length = max_length
    self.padding_side = padding_side
    self.return_embeds = return_embeds
    self.embeds_transpose = embeds_transpose

    self.token_to_id = {
        "A": 0,
        "a": 0,
        "C": 1,
        "c": 1,
        "G": 2,
        "g": 2,
        "T": 3,
        "t": 3,
        "U": 3,
        "u": 3,  # RNA
        "N": 4,
        "n": 4,
        "X": 4,
        "x": 4,
        "-": -1,
        ".": -1,  # Padding
    }

    self.id_to_token = {0: "A", 1: "C", 2: "G", 3: "T", 4: "N", -1: "-"}

    self.pad_token_id = -1
    self.pad_token = "-"  # noqa: S105
    self.unk_token_id = 4  # N

    self.vocab_vectors = torch.tensor(
        [
            [1.0, 0.0, 0.0, 0.0],  # 0: A
            [0.0, 1.0, 0.0, 0.0],  # 1: C
            [0.0, 0.0, 1.0, 0.0],  # 2: G
            [0.0, 0.0, 0.0, 1.0],  # 3: T / U
            [0.25, 0.25, 0.25, 0.25],  # 4: N / X
            [0.0, 0.0, 0.0, 0.0],  # 5: Padding, index=-1
        ],
        dtype=torch.float32,
    )
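
A minimal usage sketch, using only the attributes defined in the constructor above (token_to_id, unk_token_id, vocab_vectors). The tokenizer's encoding entry point is not shown in this excerpt, so the mapping is done by hand here; treat it as illustrative rather than as the library's API.

import torch

from dnallm.models.tokenizer import DNAOneHotTokenizer

tokenizer = DNAOneHotTokenizer(max_length=512)

# Map a raw sequence to integer IDs; characters missing from the vocabulary
# fall back to unk_token_id (4, i.e. "N").
seq = "ACGTN-"
ids = [tokenizer.token_to_id.get(base, tokenizer.unk_token_id) for base in seq]
# ids == [0, 1, 2, 3, 4, -1]

# Look up one-hot vectors. Index -1 selects the last row (all-zero padding),
# matching the "Padding, index=-1" comment in vocab_vectors.
one_hot = tokenizer.vocab_vectors[torch.tensor(ids)]
print(one_hot.shape)  # torch.Size([6, 4])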
Functions
from_pretrained classmethod
from_pretrained(pretrained_model_name_or_path, **kwargs)

Load Tokenizer from a specified directory.

Source code in dnallm/models/tokenizer.py
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
    """
    Load Tokenizer from a specified directory.
    """
    config_file = os.path.join(
        pretrained_model_name_or_path, "tokenizer_config.json"
    )

    if os.path.isfile(config_file):
        with open(config_file, encoding="utf-8") as f:
            config = json.load(f)

        # Remove some metadata that should not be passed to __init__
        config.pop("tokenizer_class", None)
        config.pop("pad_token_id", None)
        config.pop("unk_token_id", None)

        config.update(kwargs)

        return cls(**config)
    else:
        print(
            "Warning: tokenizer_config.json not found",
            f"in {pretrained_model_name_or_path}. Using default init.",
        )
        return cls(**kwargs)
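
A brief example of loading from a directory produced by save_pretrained (below). Keyword arguments passed here override values read from tokenizer_config.json, per config.update(kwargs) above; the directory name is a placeholder.

from dnallm.models.tokenizer import DNAOneHotTokenizer

# padding_side overrides whatever is stored in the saved config.
tokenizer = DNAOneHotTokenizer.from_pretrained(
    "path/to/saved_tokenizer",  # placeholder directory
    padding_side="left",
)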
save_pretrained
save_pretrained(save_directory, **kwargs)

Save tokenizer configuration and vocabulary. Generates: tokenizer_config.json and vocab.json

Source code in dnallm/models/tokenizer.py
def save_pretrained(self, save_directory: str, **kwargs):
    """
    Save tokenizer configuration and vocabulary.
    Generates: tokenizer_config.json and vocab.json
    """
    os.makedirs(save_directory, exist_ok=True)

    config = {
        "max_length": self.max_length,
        "padding_side": self.padding_side,
        "pad_token_id": self.pad_token_id,
        "unk_token_id": self.unk_token_id,
        "tokenizer_class": self.__class__.__name__,
    }

    config_file = os.path.join(save_directory, "tokenizer_config.json")
    with open(config_file, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2, ensure_ascii=False)

    vocab_file = os.path.join(save_directory, "vocab.json")
    with open(vocab_file, "w", encoding="utf-8") as f:
        json.dump(self.token_to_id, f, indent=2, ensure_ascii=False)

    return [config_file, vocab_file]
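
A round-trip sketch using the two methods above; the directory name is a placeholder. Note that only the fields written by save_pretrained (max_length, padding_side, pad_token_id, unk_token_id, tokenizer_class) are persisted, so other constructor options take their defaults after reloading.

from dnallm.models.tokenizer import DNAOneHotTokenizer

tokenizer = DNAOneHotTokenizer(max_length=1024, padding_side="left")

# Writes tokenizer_config.json and vocab.json into the target directory
# and returns their paths.
paths = tokenizer.save_pretrained("my_dna_tokenizer")  # placeholder path
print(paths)

# Reload; max_length and padding_side are restored from tokenizer_config.json
# (pad_token_id, unk_token_id, and tokenizer_class are stripped before
# __init__ is called).
reloaded = DNAOneHotTokenizer.from_pretrained("my_dna_tokenizer")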