Skip to content

utils/sequence API

dnallm.utils.sequence

Sequence utility functions for DNA sequence analysis and generation.

This module provides functions for calculating GC content, generating reverse complements, converting sequences to k-mers, validating DNA sequences, and randomly generating DNA sequences with constraints.

All functions are designed for use in DNA language modeling and bioinformatics pipelines.

Functions

calc_gc_content

calc_gc_content(seq)

Calculate the GC content of a DNA sequence.

Parameters:

Name Type Description Default
seq str

DNA sequence (A/C/G/T/U/N, case-insensitive).

required

Returns:

Name Type Description
float float

GC content (0.0 ~ 1.0). Returns 0.0 if sequence is empty.

Source code in dnallm/utils/sequence.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
def calc_gc_content(seq: str) -> float:
    """
    Return the fraction of G and C bases in a nucleotide sequence.

    The sequence is upper-cased, "U" is rewritten to "T", and every "N" is
    dropped before counting, so ambiguous bases do not contribute to the
    denominator.

    Args:
        seq (str): DNA sequence (A/C/G/T/U/N, case-insensitive).

    Returns:
        float: GC content (0.0 ~ 1.0). Returns 0.0 if nothing remains after
            removing "N" (including the empty sequence).
    """
    bases = seq.upper().replace("U", "T").replace("N", "")
    # Guard against division by zero for empty / all-N input.
    if not bases:
        return 0.0
    gc_count = bases.count("G") + bases.count("C")
    return gc_count / len(bases)

check_sequence

check_sequence(
    seq,
    minl=1,
    maxl=500000000,
    gc=(0, 1),
    valid_chars="ACGTN",
)

Check if a DNA sequence is valid based on length, GC content, and allowed characters.

Parameters:

Name Type Description Default
seq str

DNA sequence.

required
minl int

Minimum length. Defaults to 1.

1
maxl int

Maximum length. Defaults to 500000000.

500000000
gc tuple

GC content range (min, max). Defaults to (0, 1).

(0, 1)
valid_chars str

Allowed characters. Defaults to "ACGTN".

'ACGTN'

Returns:

Name Type Description
bool bool

True if valid, False otherwise.

Source code in dnallm/utils/sequence.py
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
def check_sequence(
    seq: str,
    minl: int = 1,
    maxl: int = 500000000,
    gc: tuple = (0, 1),
    valid_chars: str = "ACGTN",
) -> bool:
    """Check if a DNA sequence is valid based on length, allowed characters,
    and GC content.

    The character comparison is case-insensitive: both ``seq`` and
    ``valid_chars`` are upper-cased before being compared.

    Args:
        seq (str): DNA sequence.
        minl (int, optional): Minimum length (inclusive). Defaults to 1.
        maxl (int, optional): Maximum length (inclusive).
            Defaults to 500000000.
        gc (tuple, optional): GC content range (min, max), both inclusive.
            Defaults to (0, 1).
        valid_chars (str, optional): Allowed characters. Defaults to "ACGTN".

    Returns:
        bool: True if valid, False otherwise.
    """
    # Sequence length is outside the valid range.
    if not minl <= len(seq) <= maxl:
        return False
    # Sequence contains unsupported characters. Checked before the GC
    # content so invalid input is rejected without computing it.
    if not set(seq.upper()) <= set(valid_chars.upper()):
        return False
    # GC content is computed once and must lie inside the requested range.
    gc_content = calc_gc_content(seq)
    return gc[0] <= gc_content <= gc[1]

random_generate_sequences

random_generate_sequences(
    minl,
    maxl=0,
    samples=1,
    gc=(0, 1),
    n_ratio=0.0,
    padding_size=0,
    seed=None,
)

Randomly generate DNA sequences with specified length, GC content, and N ratio.

Parameters:

Name Type Description Default
minl int

Minimum sequence length.

required
maxl int

Maximum sequence length. If 0, use minl as fixed length. Defaults to 0.

0
samples int

Number of sequences to generate. Defaults to 1.

1
gc tuple

GC content range (min, max). Defaults to (0, 1).

(0, 1)
n_ratio float

Proportion of 'N' bases (0.0 ~ 1.0). Defaults to 0.0.

0.0
padding_size int

Pad length to nearest multiple. Defaults to 0.

0
seed int

Random seed. Defaults to None.

None

Returns:

Type Description
list[str]

list[str]: List of generated DNA sequences.

Source code in dnallm/utils/sequence.py
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
def _draw_padded_length(minl: int, maxl: int, padding_size: int) -> int:
    """Draw a random length in [minl, maxl], optionally snapped to a
    multiple of ``padding_size``."""
    length = random.randint(minl, maxl)  # noqa: S311
    if padding_size and length % padding_size:
        # Round up to the next multiple; if that overshoots maxl, step back
        # down to the previous multiple instead.
        length = (length // padding_size + 1) * padding_size
        if length > maxl:
            length -= padding_size
    return length


def random_generate_sequences(
    minl: int,
    maxl: int = 0,
    samples: int = 1,
    gc: tuple = (0, 1),
    n_ratio: float = 0.0,
    padding_size: int = 0,
    seed: int | None = None,
) -> list[str]:
    """Randomly generate DNA sequences with specified length, GC content,
    and N ratio.

    Sequences are drawn by rejection sampling: each candidate is generated
    independently and kept only if its GC content lies inside ``gc``.

    Args:
        minl (int): Minimum sequence length.
        maxl (int, optional): Maximum sequence length. If 0, use minl as
            fixed length. Defaults to 0.
        samples (int, optional): Number of sequences to generate.
            Defaults to 1.
        gc (tuple, optional): GC content range (min, max), both inclusive.
            Defaults to (0, 1).
        n_ratio (float, optional): Proportion of 'N' bases (0.0 ~ 1.0).
            A value greater than 1 is interpreted as a percentage
            (0 ~ 100). Defaults to 0.0.
        padding_size (int, optional): Pad length to nearest multiple. Only
            applied when ``maxl`` is non-zero. When rounding up would exceed
            ``maxl`` the length is rounded down instead, which can yield a
            length below ``minl``. Defaults to 0.
        seed (int, optional): Random seed; any integer, including 0, seeds
            the global ``random`` generator. Defaults to None.

    Returns:
        list[str]: List of generated DNA sequences.

    Raises:
        ValueError: If the lower GC bound is greater than the upper bound
            (no sequence could ever be accepted), or if ``maxl`` is
            non-zero and smaller than ``minl``.
    """
    basemap = ["A", "C", "G", "T"]
    weights: list[float] | None = None
    if 0.0 < n_ratio <= 1.0:
        basemap.append("N")
        weights = [(1 - n_ratio) / 4] * 4 + [n_ratio]
    elif n_ratio > 1.0:
        # Values above 1 are treated as a percentage.
        basemap.append("N")
        weights = [(100 - n_ratio) / 4] * 4 + [n_ratio]

    gc_min, gc_max = gc[0], gc[1]
    if gc_min > gc_max:
        # An inverted range would otherwise reject every candidate forever.
        raise ValueError(
            f"Invalid GC range: min {gc_min} is greater than max {gc_max}"
        )
    # GC content always lies in [0, 1], so filtering is only needed when the
    # requested range actually excludes part of that interval.
    filter_gc = gc_min > 0 or gc_max < 1

    # Compare against None so that seed=0 is honoured.
    if seed is not None:
        random.seed(seed)

    sequences: list[str] = []
    # The context manager closes the progress bar even if generation fails.
    with tqdm(total=samples, desc="Generating sequences") as progress_bar:
        while len(sequences) < samples:
            # Random length when maxl is given, otherwise fixed at minl.
            length = (
                _draw_padded_length(minl, maxl, padding_size)
                if maxl
                else minl
            )
            seq = "".join(
                random.choices(  # noqa: S311
                    basemap,
                    weights=weights,
                    k=length,
                )
            )
            if filter_gc and not gc_min <= calc_gc_content(seq) <= gc_max:
                continue  # rejected: GC content outside the requested range
            sequences.append(seq)
            progress_bar.update(1)
    return sequences

reverse_complement

reverse_complement(seq, reverse=True, complement=True)

Compute the reverse complement of a DNA sequence.

Parameters:

Name Type Description Default
seq str

DNA sequence.

required
reverse bool

Whether to reverse the sequence. Defaults to True.

True
complement bool

Whether to complement the sequence. Defaults to True.

True

Returns:

Name Type Description
str str

The reverse complement (or as specified) of the input sequence.

Source code in dnallm/utils/sequence.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
def reverse_complement(
    seq: str, reverse: bool = True, complement: bool = True
) -> str:
    """Return the reversed and/or complemented form of a DNA sequence.

    Complementation swaps A<->T and C<->G while preserving case; "N"/"n"
    and any other character are passed through unchanged.

    Args:
        seq (str): DNA sequence.
        reverse (bool, optional): Whether to reverse the sequence.
            Defaults to True.
        complement (bool, optional): Whether to complement the sequence.
            Defaults to True.

    Returns:
        str: The reverse complement (or as specified) of the input sequence.
    """
    result = seq[::-1] if reverse else seq
    if not complement:
        return result
    # Characters absent from the table are left untouched by translate().
    table = str.maketrans("ATCGatcgNn", "TAGCtagcNn")
    return result.translate(table)

seq2kmer

seq2kmer(seqs, k)

Convert a list of DNA sequences to k-mers (overlapping k-mer tokenization).

Parameters:

Name Type Description Default
seqs list[str]

List of DNA sequences.

required
k int

k-mer length.

required

Returns:

Type Description
list[str]

list[str]: List of k-mer tokenized sequences (space-separated k-mers).

Source code in dnallm/utils/sequence.py
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
def seq2kmer(seqs: list[str], k: int) -> list[str]:
    """Tokenize DNA sequences into overlapping, upper-cased k-mers.

    Each sequence is scanned with a window of width ``k`` that advances one
    base at a time; a sequence shorter than ``k`` yields an empty string.

    Args:
        seqs (list[str]): List of DNA sequences.
        k (int): k-mer length.

    Returns:
        list[str]: List of k-mer tokenized sequences (space-separated
            k-mers), one entry per input sequence.
    """
    return [
        " ".join(
            sequence[start : start + k].upper()
            for start in range(len(sequence) + 1 - k)
        )
        for sequence in seqs
    ]