Skip to content

utils/sequence API

Sequence utility functions for DNA sequence analysis and generation.

This module provides functions for:

- Calculating GC content
- Generating reverse complements
- Converting sequences to k-mers
- Validating DNA sequences
- Randomly generating DNA sequences with constraints

All functions are designed for use in DNA language modeling and bioinformatics pipelines.

calc_gc_content(seq)

Calculate the GC content of a DNA sequence.

Parameters:

Name Type Description Default
seq str

DNA sequence (A/C/G/T/U/N, case-insensitive).

required

Returns:

Name Type Description
float float

GC content (0.0 ~ 1.0). Returns 0.0 if sequence is empty.

Source code in dnallm/utils/sequence.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
def calc_gc_content(seq: str) -> float:
    """
    Calculate the GC content of a DNA sequence.

    The sequence is upper-cased and "N" bases are dropped before counting,
    so ambiguous positions are excluded from both the G/C count and the
    length used as the denominator. "U" is counted as a non-GC base.

    Args:
        seq (str): DNA sequence (A/C/G/T/U/N, case-insensitive).

    Returns:
        float: GC content (0.0 ~ 1.0). Returns 0.0 if the sequence is empty
            or contains only "N".
    """
    bases = seq.upper().replace("N", "")
    if not bases:
        # Nothing countable is left; return 0.0 instead of dividing by zero.
        return 0.0
    return (bases.count("G") + bases.count("C")) / len(bases)

check_sequence(seq, minl=1, maxl=500000000, gc=(0, 1), valid_chars='ACGTN')

Check if a DNA sequence is valid based on length, GC content, and allowed characters.

Parameters:

Name Type Description Default
seq str

DNA sequence.

required
minl int

Minimum length. Defaults to 1.

1
maxl int

Maximum length. Defaults to 500000000.

500000000
gc tuple

GC content range (min, max). Defaults to (0, 1).

(0, 1)
valid_chars str

Allowed characters. Defaults to "ACGTN".

'ACGTN'

Returns:

Name Type Description
bool bool

True if valid, False otherwise.

Source code in dnallm/utils/sequence.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
def check_sequence(
    seq: str,
    minl: int = 1,
    maxl: int = 500000000,
    gc: tuple = (0, 1),
    valid_chars: str = "ACGTN"
) -> bool:
    """
    Check if a DNA sequence is valid based on length, GC content, and allowed characters.

    The checks run in this order: length, GC content, alphabet. The sequence
    is upper-cased before the alphabet check, so lowercase input is accepted
    as long as its upper-case form is in ``valid_chars``.

    Args:
        seq (str): DNA sequence.
        minl (int, optional): Minimum length (inclusive). Defaults to 1.
        maxl (int, optional): Maximum length (inclusive). Defaults to 500000000.
        gc (tuple, optional): GC content range (min, max), both inclusive.
            Defaults to (0, 1).
        valid_chars (str, optional): Allowed characters. Defaults to "ACGTN".

    Returns:
        bool: True if valid, False otherwise.
    """
    if not minl <= len(seq) <= maxl:
        return False  # sequence length is outside the allowed range

    # Compute the GC content once and compare it against both bounds.
    gc_content = calc_gc_content(seq)
    if not gc[0] <= gc_content <= gc[1]:
        return False  # GC content is outside the allowed range

    # Valid only if every (upper-cased) character is in the allowed alphabet.
    return set(seq.upper()) <= set(valid_chars)

random_generate_sequences(minl, maxl=0, samples=1, gc=(0, 1), N_ratio=0.0, padding_size=0, seed=None)

Randomly generate DNA sequences with specified length, GC content, and N ratio.

Parameters:

Name Type Description Default
minl int

Minimum sequence length.

required
maxl int

Maximum sequence length. If 0, use minl as fixed length. Defaults to 0.

0
samples int

Number of sequences to generate. Defaults to 1.

1
gc tuple

GC content range (min, max). Defaults to (0, 1).

(0, 1)
N_ratio float

Proportion of 'N' bases (0.0 ~ 1.0). Defaults to 0.0.

0.0
padding_size int

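Round each randomly drawn length up to the nearest multiple of this value (stepping back one multiple if that would exceed maxl). Only applied when maxl is non-zero. Defaults to 0 (no padding).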

0
seed int

Random seed. Defaults to None.

None

Returns:

Type Description
list[str]

list[str]: List of generated DNA sequences.

Source code in dnallm/utils/sequence.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
def random_generate_sequences(
    minl: int,
    maxl: int = 0,
    samples: int = 1,
    gc: tuple = (0, 1),
    N_ratio: float = 0.0,
    padding_size: int = 0,
    seed: int = None
) -> list[str]:
    """
    Randomly generate DNA sequences with specified length, GC content, and N ratio.

    Sequences are drawn by rejection sampling: a candidate is generated and
    kept only if its GC content falls inside ``gc``. A GC range that is very
    unlikely for the requested length can therefore take a long time.

    Args:
        minl (int): Minimum sequence length.
        maxl (int, optional): Maximum sequence length. If 0, use minl as fixed length. Defaults to 0.
        samples (int, optional): Number of sequences to generate. Defaults to 1.
        gc (tuple, optional): GC content range (min, max), inclusive. Defaults to (0, 1).
        N_ratio (float, optional): Proportion of 'N' bases (0.0 ~ 1.0). Values
            greater than 1 are interpreted as a percentage. Defaults to 0.0.
        padding_size (int, optional): Round each randomly drawn length up to
            the nearest multiple of this value (stepping back one multiple if
            that exceeds maxl). Only applied when maxl is non-zero.
            Defaults to 0.
        seed (int, optional): Random seed passed to ``random.seed``; any
            value other than None (including 0) is applied. Defaults to None.

    Returns:
        list[str]: List of generated DNA sequences.
    """
    basemap = ["A", "C", "G", "T"]
    if 0.0 < N_ratio <= 1.0:
        basemap.append("N")
        weights = [(1 - N_ratio) / 4] * 4 + [N_ratio]
    elif N_ratio > 1.0:
        # Ratios above 1 are treated as percentages (0 ~ 100).
        basemap.append("N")
        weights = [(100 - N_ratio) / 4] * 4 + [N_ratio]
    else:
        weights = None

    # The default range accepts every sequence, so the GC check can be skipped.
    filter_gc = tuple(gc) != (0, 1)

    # Compare against None so that seed=0 is honoured as a real seed.
    if seed is not None:
        random.seed(seed)

    sequences = []
    # The context manager guarantees the progress bar is closed on exit.
    with tqdm(total=samples, desc="Generating sequences") as progress_bar:
        while len(sequences) < samples:
            if maxl:
                # Variable length: draw a length, then snap it to the padding grid.
                length = random.randint(minl, maxl)
                if padding_size and length % padding_size:
                    length = (length // padding_size + 1) * padding_size
                    if length > maxl:
                        length -= padding_size
            else:
                # Fixed length.
                length = minl

            seq = "".join(random.choices(basemap, weights=weights, k=length))

            # Rejection sampling on GC content.
            if filter_gc and not gc[0] <= calc_gc_content(seq) <= gc[1]:
                continue

            sequences.append(seq)
            progress_bar.update(1)
    return sequences

reverse_complement(seq, reverse=True, complement=True)

Compute the reverse complement of a DNA sequence.

Parameters:

Name Type Description Default
seq str

DNA sequence.

required
reverse bool

Whether to reverse the sequence. Defaults to True.

True
complement bool

Whether to complement the sequence. Defaults to True.

True

Returns:

Name Type Description
str str

The reverse complement (or as specified) of the input sequence.

Source code in dnallm/utils/sequence.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def reverse_complement(seq: str, reverse: bool = True, complement: bool = True) -> str:
    """
    Return the reversed and/or complemented form of a DNA sequence.

    Complementing swaps A<->T and C<->G while preserving case; "N"/"n" and
    any other character are passed through unchanged.

    Args:
        seq (str): DNA sequence.
        reverse (bool, optional): Whether to reverse the sequence. Defaults to True.
        complement (bool, optional): Whether to complement the sequence. Defaults to True.

    Returns:
        str: The reverse complement (or as specified) of the input sequence.
    """
    result = seq[::-1] if reverse else seq
    if not complement:
        return result
    # Characters missing from the table are left untouched by translate().
    complement_table = str.maketrans("ACGTacgtNn", "TGCAtgcaNn")
    return result.translate(complement_table)

seq2kmer(seqs, k)

Convert a list of DNA sequences to k-mers (overlapping k-mer tokenization).

Parameters:

Name Type Description Default
seqs list[str]

List of DNA sequences.

required
k int

k-mer length.

required

Returns:

Type Description
list[str]

list[str]: List of k-mer tokenized sequences (space-separated k-mers).

Source code in dnallm/utils/sequence.py
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
def seq2kmer(seqs: list[str], k: int) -> list[str]:
    """
    Convert a list of DNA sequences to k-mers (overlapping k-mer tokenization).

    Each sequence is upper-cased and split into every window of length ``k``
    with a stride of 1. A sequence shorter than ``k`` yields an empty string.

    Args:
        seqs (list[str]): List of DNA sequences.
        k (int): k-mer length. Must be at least 1.

    Returns:
        list[str]: List of k-mer tokenized sequences (space-separated k-mers).

    Raises:
        ValueError: If ``k`` is less than 1.
    """
    if k < 1:
        # A non-positive k would produce empty or mis-sliced tokens.
        raise ValueError(f"k must be a positive integer, got {k}")

    all_kmers = []
    for seq in seqs:
        # Upper-case once per sequence rather than once per k-mer.
        upper_seq = seq.upper()
        windows = (upper_seq[i:i + k] for i in range(len(upper_seq) - k + 1))
        all_kmers.append(" ".join(windows))
    return all_kmers