Fine-tuning Data
In [1]:
from dnallm.datahandling import *
Use the preset dataset to quickly try the fine-tuning pipeline
In [2]:
# Display preset datasets
show_preset_dataset()
Out[2]:
{'nucleotide_transformer_downstream_tasks': {'name': 'lgq12697/nucleotide_transformer_downstream_tasks',
'description': 'A collection of nucleotide transformer downstream tasks datasets from NT.',
'reference': 'https://doi.org/10.1038/s41592-024-02523-z',
'tasks': ['enhancers',
'enhancers_types',
'H2AFZ',
'H3K27ac',
'H3K27me3',
'H3K36me3',
'H3K4me1',
'H3K4me2',
'H3K4me3',
'H3K9ac',
'H3K9me3',
'H4K20me1',
'promoter_all',
'promoter_no_tata',
'promoter_tata',
'splice_sites_acceptors',
'splice_sites_all',
'splice_sites_donors'],
'default_task': 'enhancers',
'format': 'csv',
'sequence': 'sequence',
'label': 'label',
'separator': ',',
'multi_separator': ';'},
'GUE': {'name': 'lgq12697/GUE',
'description': 'A dataset for Genome Understanding Evaluation (GUE) tasks from DNABERT-2.',
'reference': 'https://doi.org/10.48550/arXiv.2306.15006',
'tasks': ['emp_H3',
'emp_H3K14ac',
'emp_H3K36me3',
'emp_H3K4me1',
'emp_H3K4me2',
'emp_H3K4me3',
'emp_H3K79me3',
'emp_H3K9ac',
'emp_H4',
'emp_H4ac',
'EPI_GM12878',
'EPI_HeLa-S3',
'EPI_HUVEC',
'EPI_IMR90',
'EPI_K562',
'EPI_NHEK',
'fungi_species_20',
'human_tf_0',
'human_tf_1',
'human_tf_2',
'human_tf_3',
'human_tf_4',
'mouse_0',
'mouse_1',
'mouse_2',
'mouse_3',
'mouse_4',
'phage_fragments',
'prom_300_all',
'prom_300_notata',
'prom_300_tata',
'prom_core_all',
'prom_core_notata',
'prom_core_tata',
'splice_reconstructed',
'virus_covid',
'virus_species_40'],
'default_task': 'prom_core_all',
'format': 'csv',
'sequence': 'sequence',
'label': 'label',
'separator': ',',
'multi_separator': ';'},
'plant-genomic-benchmark': {'name': 'lgq12697/plant-genomic-benchmark',
'description': 'A benchmark dataset for plant genomic tasks from AgroNT.',
'reference': 'https://doi.org/10.1038/s42003-024-06465-2',
'tasks': ['poly_a.arabidopsis_thaliana',
'poly_a.oryza_sativa_indica',
'poly_a.oryza_sativa_japonica',
'poly_a.chlamydomonas_reinhardtii',
'poly_a.medicago_truncatula',
'poly_a.trifolium_pratense',
'splicing.arabidopsis_thaliana_acceptor',
'splicing.arabidopsis_thaliana_donor',
'lncrna.m_esculenta',
'lncrna.z_mays',
'lncrna.g_max',
'lncrna.s_lycopersicum',
'lncrna.t_aestivum',
'lncrna.s_bicolor',
'promoter_strength.leaf',
'promoter_strength.protoplast',
'terminator_strength.leaf',
'terminator_strength.protoplast',
'gene_exp.arabidopsis_thaliana',
'gene_exp.oryza_sativa',
'gene_exp.zea_mays',
'gene_exp.glycine_max',
'gene_exp.solanum_lycopersicum',
'pro_seq.m_esculenta',
'chromatin_access.arabidopis_thaliana',
'chromatin_access.oryza_sativa_MH63',
'chromatin_access.oryza_sativa_ZS97',
'chromatin_access.brachypodium_distachyon',
'chromatin_access.zea_mays',
'chromatin_access.sorghum_bicolor',
'chromatin_access.setaria_italica'],
'default_task': 'poly_a.arabidopsis_thaliana',
'format': 'csv',
'sequence': 'sequence',
'label': 'label',
'separator': ',',
'multi_separator': ';'}}
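Because show_preset_dataset() returns this mapping (as the Out[2] cell shows), a dataset and task can also be chosen programmatically instead of copied by hand. A minimal sketch, using only the keys documented in the output above:

# Pick a preset and task from the returned registry
presets = show_preset_dataset()
info = presets['plant-genomic-benchmark']
print(info['tasks'])            # all tasks available for this preset
task = info['default_task']     # e.g., 'poly_a.arabidopsis_thaliana'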
In [3]:
# Load a preset dataset
dataset = load_preset_dataset(dataset_name='plant-genomic-benchmark', task='promoter_strength.leaf')
Loading dataset: lgq12697/plant-genomic-benchmark ... promoter_strength.leaf
Downloading [README.md]: 0.00B [00:00, ?B/s]
2025-12-28 14:35:58,663 - modelscope - INFO - storing https://www.modelscope.cn/api/v1/datasets/lgq12697/plant-genomic-benchmark/repo?Source=SDK&Revision=master&FilePath=README.md&View=False in cache at /Users/forrest/.cache/modelscope/hub/datasets/c59000af909fc22889b4786d4a362c0d0dc6b8e4b07094c6c0c358cfcb435f02
2025-12-28 14:35:58,664 - modelscope - INFO - creating metadata file for /Users/forrest/.cache/modelscope/hub/datasets/c59000af909fc22889b4786d4a362c0d0dc6b8e4b07094c6c0c358cfcb435f02
Downloading data: 0%| | 0.00/12.1M [00:00<?, ?B/s]
2025-12-28 14:36:08,124 - modelscope - INFO - storing https://www.modelscope.cn/api/v1/datasets/lgq12697/plant-genomic-benchmark/repo?Source=SDK&Revision=master&FilePath=promoter_strength.leaf%2Ftrain.csv in cache at /Users/forrest/.cache/modelscope/hub/datasets/downloads/5d18dc4a80980450245c31009650f82a2db8cb9914c761a892e252f6733bb673
2025-12-28 14:36:08,125 - modelscope - INFO - creating metadata file for /Users/forrest/.cache/modelscope/hub/datasets/downloads/5d18dc4a80980450245c31009650f82a2db8cb9914c761a892e252f6733bb673
Downloading data: 0%| | 0.00/1.49M [00:00<?, ?B/s]
2025-12-28 14:36:09,658 - modelscope - INFO - storing https://www.modelscope.cn/api/v1/datasets/lgq12697/plant-genomic-benchmark/repo?Source=SDK&Revision=master&FilePath=promoter_strength.leaf%2Ftest.csv in cache at /Users/forrest/.cache/modelscope/hub/datasets/downloads/7162681f0376bc615bdfcb777bd3306e4a622398e48c5cd637d44cf1a662d854
2025-12-28 14:36:09,659 - modelscope - INFO - creating metadata file for /Users/forrest/.cache/modelscope/hub/datasets/downloads/7162681f0376bc615bdfcb777bd3306e4a622398e48c5cd637d44cf1a662d854
Generating train split: 0 examples [00:00, ? examples/s]
Generating test split: 0 examples [00:00, ? examples/s]
In [4]:
# Display the dataset samples
dataset.show(head=1)
Dataset: train
{0: {'name': 'ENSRNA049481990_Sb',
'sequence': 'CGTTTGGGTATGGACATTTAGACTTGTCGTGTTCCTGATGCCTCCCATTCCTATGGTTCTTAGGTGCTCCTTCCTCTTCCTTTCGCTAGCGCAATTGATTTAGTGATGAACACAATATACATTCCAAAGCACATAGTTAGATGAGAGCCTGATGGCAATTGGCAAGTCAG',
'labels': -0.2166846610255508}}
Dataset: test
{0: {'name': 'AT5G03425_At',
'sequence': 'TGAGTGAAGGCAGAATTGACCCATGCAGCTTCCTTTCTTTCACCACTCACTTGCTAGGAAACTACAAAAATAGAAAAAGAAAACTCACGGCAACCAAAAACGCGAACTCCTAGAGGGTTTCGAACACTTTGAAATTTGTATCAGACATCAAATGAAATCTTTAACTTCTT',
'labels': -0.5374512291279694}}
In [5]:
# Display dataset statistics
dataset.statistics()
Out[5]:
{'train': {'data_type': 'regression',
'n_samples': 58179,
'min_len': 170,
'max_len': 170,
'mean_len': 170.0,
'median_len': 170.0},
'test': {'data_type': 'regression',
'n_samples': 7154,
'min_len': 170,
'max_len': 170,
'mean_len': 170.0,
'median_len': 170.0}}
In [6]:
# Visualize dataset statistics
dataset.plot_statistics()
Successfully plotted dataset statistics.
In [7]:
## Pre-processing of dataset
from dnallm import load_config, load_model_and_tokenizer

# Load configuration
configs = load_config("finetune_config.yaml")

# Load model and tokenizer
model, tokenizer = load_model_and_tokenizer(
    "zhangtaolab/plant-dnabert-BPE",
    task_config=configs["task"],
    source="modelscope"
)

# Tokenize the dataset
dataset.encode_sequences(tokenizer=tokenizer)
Downloading Model from https://www.modelscope.cn to directory: /Users/forrest/.cache/modelscope/hub/models/zhangtaolab/plant-dnabert-BPE
14:36:12 - dnallm.utils.support - INFO - Model files are stored in /Users/forrest/.cache/modelscope/hub/models/zhangtaolab/plant-dnabert-BPE
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /Users/forrest/.cache/modelscope/hub/models/zhangtaolab/plant-dnabert-BPE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding inputs: 0%| | 0/58179 [00:00<?, ? examples/s]
Encoding inputs: 0%| | 0/7154 [00:00<?, ? examples/s]
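After encoding, each record should carry the tokenizer's output fields alongside the raw sequence; the exact field names (e.g., input_ids, attention_mask) depend on the tokenizer and are an assumption here. A quick sanity check reuses the helper from earlier:

# Re-inspect a sample to confirm the encoded fields were added
dataset.show(head=1)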
In [8]:
## Start fine-tuning
# from dnallm.finetune import DNATrainer
# trainer = DNATrainer(
#     config=configs,
#     model=model,
#     datasets=dataset
# )
# trainer.train()
Load datasets from Hugging Face or ModelScope
In [9]:
# Load tokenizer for demonstration
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("zhangtaolab/plant-dnabert-BPE")
In [10]:
# Load dataset from Hugging Face
dataset = DNADataset.from_huggingface(
    "zhangtaolab/plant-multi-species-core-promoters",
    seq_col="sequence",
    label_col="label",
    tokenizer=tokenizer,
    max_length=512
)
In [11]:
# Load dataset from ModelScope
dataset = DNADataset.from_modelscope(
    "zhangtaolab/plant-multi-species-core-promoters",
    seq_col="sequence",
    label_col="label",
    tokenizer=tokenizer,
    max_length=512
)
Load datasets from local paths
In [12]:
# Load single dataset
dataset = DNADataset.load_local_data(
    "../../../../tests/test_data/regression/train.csv",
    seq_col="sequence",
    label_col="label",
    tokenizer=tokenizer,
    max_length=512
)
Generating train split: 0 examples [00:00, ? examples/s]
Format labels: 0%| | 0/500 [00:00<?, ? examples/s]
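The object returned by load_local_data is the same kind of dataset as in the preset example, so the inspection helpers shown earlier apply unchanged; for example:

# Inspect the locally loaded dataset as before
dataset.show(head=1)
dataset.statistics()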
In [13]:
# Load multiple files (e.g., pre-split datasets)
dataset = DNADataset.load_local_data(
    {
        "train": "../../../../tests/test_data/regression/train.csv",
        "test": "../../../../tests/test_data/regression/test.csv",
        "validation": "../../../../tests/test_data/regression/dev.csv"
    },
    seq_col="sequence",
    label_col="label",
    tokenizer=tokenizer,
    max_length=512
)
Generating train split: 0 examples [00:00, ? examples/s]
Format labels: 0%| | 0/100 [00:00<?, ? examples/s]
Generating train split: 0 examples [00:00, ? examples/s]
Format labels: 0%| | 0/100 [00:00<?, ? examples/s]
Manually prepare dataset for fine-tuning
In [14]:
## Binary classification example
# For binary classification, labels should be 0 and 1
# Data format: two columns, "sequence" and "label"
'''
sequence,label
ATCGATCGATCG,1
GCTAGCTAGCTA,0
TATATATATATA,1
'''
Out[14]:
'\nsequence,label\nATCGATCGATCG,1\nGCTAGCTAGCTA,0\nTATATATATATA,1\n'
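To try this format end to end, the rows can be written to a CSV file and loaded with load_local_data using the same arguments shown earlier. A minimal sketch; the file name data_binary.csv and the pandas dependency are illustrative assumptions:

# Write a tiny binary-classification CSV and load it
import pandas as pd

df = pd.DataFrame({
    "sequence": ["ATCGATCGATCG", "GCTAGCTAGCTA", "TATATATATATA"],
    "label": [1, 0, 1],  # binary labels: 0 or 1
})
df.to_csv("data_binary.csv", index=False)

dataset = DNADataset.load_local_data(
    "data_binary.csv",
    seq_col="sequence",
    label_col="label",
    tokenizer=tokenizer,
    max_length=512
)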
In [15]:
## Multi-class classification example
# For multi-class classification, labels should be integers starting from 0
# Data format: two columns, "sequence" and "label"
'''
sequence,label
ATCGATCGATCG,0
GCTAGCTAGCTA,1
TATATATATATA,2
CGCGCGCGCGCG,3
'''
Out[15]:
'\nsequence,label\nATCGATCGATCG,0\nGCTAGCTAGCTA,1\nTATATATATATA,2\nCGCGCGCGCGCG,3\n'
In [16]:
## Multi-label classification example
# For multi-label classification, labels should be a list of integers separated by a specific delimiter (e.g., ";")
# Data format: two columns, "sequence" and "label"
'''
sequence,label
ATCGATCGATCG,1;0;1;0;0
GCTAGCTAGCTA,0;1;0;1;0
TATATATATATA,1;1;0;0;1
'''
Out[16]:
'\nsequence,label\nATCGATCGATCG,1;0;1;0;0\nGCTAGCTAGCTA,0;1;0;1;0\nTATATATATATA,1;1;0;0;1\n'
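Note that the preset configurations above declare multi_separator ';' for exactly this layout. As a plain-Python illustration (not DNALLM internals), each label field splits on the delimiter into one indicator per class:

# How a ';'-separated multi-label field maps to a per-class vector
label_field = "1;0;1;0;0"
label_vector = [int(x) for x in label_field.split(";")]
print(label_vector)  # [1, 0, 1, 0, 0] -> sample belongs to classes 0 and 2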
In [17]:
## Regression example
# For regression, labels should be float numbers
# Data format: two columns, "sequence" and "label"
'''
sequence,label
ATCGATCGATCG,0.85
GCTAGCTAGCTA,0.23
TATATATATATA,0.67
'''
Out[17]:
'\nsequence,label\nATCGATCGATCG,0.85\nGCTAGCTAGCTA,0.23\nTATATATATATA,0.67\n'
In [18]:
## Token classification example / Named Entity Recognition (NER)
# For token classification, please refer to example/notebooks/finetune_NER_task/data_generation_and_inference.ipynb
In [19]:
## Masked language modeling (MLM) example
# For MLM, only a single "sequence" column is needed (no labels), since this is a self-supervised task.
# Sequences can be extracted from FASTA files of genomic data.
'''
sequence
ATCGATCGATCG
GCTAGCTAGCTA
TATATATATATA
'''
Out[19]:
'\nsequence\nATCGATCGATCG\nGCTAGCTAGCTA\nTATATATATATA\n'
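Since the comment above points to FASTA files as the sequence source, here is a dependency-free sketch that slices a FASTA genome into fixed-length windows and writes the one-column CSV used for MLM. The file names, window size, and step are illustrative assumptions:

# Slice a FASTA file into fixed-length windows and save a one-column "sequence" CSV
def fasta_to_mlm_csv(fasta_path, csv_path, window=170, step=170):
    # Collect one concatenated sequence per FASTA record
    seqs, current = [], []
    with open(fasta_path) as fh:
        for line in fh:
            line = line.strip()
            if line.startswith(">"):
                if current:
                    seqs.append("".join(current))
                current = []
            elif line:
                current.append(line.upper())
    if current:
        seqs.append("".join(current))
    # Write one window per row under a single "sequence" header
    with open(csv_path, "w") as out:
        out.write("sequence\n")
        for seq in seqs:
            for i in range(0, len(seq) - window + 1, step):
                out.write(seq[i:i + window] + "\n")

# fasta_to_mlm_csv("genome.fa", "mlm_data.csv")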
In [20]:
## Generation example / Causal Language Modeling (CLM)
# For plain CLM, a single "sequence" column is sufficient, as shown below.
# Sequence-to-sequence generation tasks instead use two columns, "input_sequence" and "output_sequence".
# Sequences can be extracted from FASTA files of genomic data.
'''
sequence
ATCGATCGATCG
GCTAGCTAGCTA
TATATATATATA
'''
Out[20]:
'\nsequence\nATCGATCGATCG\nGCTAGCTAGCTA\nTATATATATATA\n'
In [21]:
## These sequences and labels can be saved as a CSV file for loading.
# dataset = DNADataset.load_local_data(
#     "data.csv",
#     seq_col="sequence",
#     label_col="label",
#     tokenizer=tokenizer,
#     max_length=512
# )