Fine-tuning Data
In [1]:
from dnallm.datahandling import *
Use the preset dataset to quickly try the fine-tuning pipeline
In [2]:
# Display preset datasets
show_preset_dataset()
Out[2]:
{'nucleotide_transformer_downstream_tasks': {'name': 'lgq12697/nucleotide_transformer_downstream_tasks',
'description': 'A collection of nucleotide transformer downstream tasks datasets from NT.',
'reference': 'https://doi.org/10.1038/s41592-024-02523-z',
'tasks': ['enhancers',
'enhancers_types',
'H2AFZ',
'H3K27ac',
'H3K27me3',
'H3K36me3',
'H3K4me1',
'H3K4me2',
'H3K4me3',
'H3K9ac',
'H3K9me3',
'H4K20me1',
'promoter_all',
'promoter_no_tata',
'promoter_tata',
'splice_sites_acceptors',
'splice_sites_all',
'splice_sites_donors'],
'default_task': 'enhancers',
'format': 'csv',
'sequence': 'sequence',
'label': 'label',
'separator': ',',
'multi_separator': ';'},
'GUE': {'name': 'lgq12697/GUE',
'description': 'A dataset for Genome Understanding Evaluation (GUE) tasks from DNABERT-2.',
'reference': 'https://doi.org/10.48550/arXiv.2306.15006',
'tasks': ['emp_H3',
'emp_H3K14ac',
'emp_H3K36me3',
'emp_H3K4me1',
'emp_H3K4me2',
'emp_H3K4me3',
'emp_H3K79me3',
'emp_H3K9ac',
'emp_H4',
'emp_H4ac',
'EPI_GM12878',
'EPI_HeLa-S3',
'EPI_HUVEC',
'EPI_IMR90',
'EPI_K562',
'EPI_NHEK',
'fungi_species_20',
'human_tf_0',
'human_tf_1',
'human_tf_2',
'human_tf_3',
'human_tf_4',
'mouse_0',
'mouse_1',
'mouse_2',
'mouse_3',
'mouse_4',
'phage_fragments',
'prom_300_all',
'prom_300_notata',
'prom_300_tata',
'prom_core_all',
'prom_core_notata',
'prom_core_tata',
'splice_reconstructed',
'virus_covid',
'virus_species_40'],
'default_task': 'prom_core_all',
'format': 'csv',
'sequence': 'sequence',
'label': 'label',
'separator': ',',
'multi_separator': ';'},
'plant-genomic-benchmark': {'name': 'lgq12697/plant-genomic-benchmark',
'description': 'A benchmark dataset for plant genomic tasks from AgroNT.',
'reference': 'https://doi.org/10.1038/s42003-024-06465-2',
'tasks': ['poly_a.arabidopsis_thaliana',
'poly_a.oryza_sativa_indica',
'poly_a.oryza_sativa_japonica',
'poly_a.chlamydomonas_reinhardtii',
'poly_a.medicago_truncatula',
'poly_a.trifolium_pratense',
'splicing.arabidopsis_thaliana_acceptor',
'splicing.arabidopsis_thaliana_donor',
'lncrna.m_esculenta',
'lncrna.z_mays',
'lncrna.g_max',
'lncrna.s_lycopersicum',
'lncrna.t_aestivum',
'lncrna.s_bicolor',
'promoter_strength.leaf',
'promoter_strength.protoplast',
'terminator_strength.leaf',
'terminator_strength.protoplast',
'gene_exp.arabidopsis_thaliana',
'gene_exp.oryza_sativa',
'gene_exp.zea_mays',
'gene_exp.glycine_max',
'gene_exp.solanum_lycopersicum',
'pro_seq.m_esculenta',
'chromatin_access.arabidopis_thaliana',
'chromatin_access.oryza_sativa_MH63',
'chromatin_access.oryza_sativa_ZS97',
'chromatin_access.brachypodium_distachyon',
'chromatin_access.zea_mays',
'chromatin_access.sorghum_bicolor',
'chromatin_access.setaria_italica'],
'default_task': 'poly_a.arabidopsis_thaliana',
'format': 'csv',
'sequence': 'sequence',
'label': 'label',
'separator': ',',
'multi_separator': ';'}}
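Because show_preset_dataset() returns this mapping (as the Out[2] cell shows), a dataset and task can also be chosen programmatically instead of copied by hand. A minimal sketch, using only the keys documented in the output above:

# Pick a preset and task from the returned registry
presets = show_preset_dataset()
info = presets['plant-genomic-benchmark']
print(info['tasks'])            # all tasks available for this preset
task = info['default_task']     # e.g., 'poly_a.arabidopsis_thaliana'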
In [3]:
# Load a preset dataset
dataset = load_preset_dataset(dataset_name='plant-genomic-benchmark', task='promoter_strength.leaf')
Loading dataset: lgq12697/plant-genomic-benchmark ... promoter_strength.leaf
Downloading [README.md]: 0.00B [00:00, ?B/s]
2025-12-28 14:35:58,663 - modelscope - INFO - storing https://www.modelscope.cn/api/v1/datasets/lgq12697/plant-genomic-benchmark/repo?Source=SDK&Revision=master&FilePath=README.md&View=False in cache at /Users/forrest/.cache/modelscope/hub/datasets/c59000af909fc22889b4786d4a362c0d0dc6b8e4b07094c6c0c358cfcb435f02
2025-12-28 14:35:58,664 - modelscope - INFO - creating metadata file for /Users/forrest/.cache/modelscope/hub/datasets/c59000af909fc22889b4786d4a362c0d0dc6b8e4b07094c6c0c358cfcb435f02
Downloading data: 0%| | 0.00/12.1M [00:00<?, ?B/s]
2025-12-28 14:36:08,124 - modelscope - INFO - storing https://www.modelscope.cn/api/v1/datasets/lgq12697/plant-genomic-benchmark/repo?Source=SDK&Revision=master&FilePath=promoter_strength.leaf%2Ftrain.csv in cache at /Users/forrest/.cache/modelscope/hub/datasets/downloads/5d18dc4a80980450245c31009650f82a2db8cb9914c761a892e252f6733bb673
2025-12-28 14:36:08,125 - modelscope - INFO - creating metadata file for /Users/forrest/.cache/modelscope/hub/datasets/downloads/5d18dc4a80980450245c31009650f82a2db8cb9914c761a892e252f6733bb673
Downloading data: 0%| | 0.00/1.49M [00:00<?, ?B/s]
2025-12-28 14:36:09,658 - modelscope - INFO - storing https://www.modelscope.cn/api/v1/datasets/lgq12697/plant-genomic-benchmark/repo?Source=SDK&Revision=master&FilePath=promoter_strength.leaf%2Ftest.csv in cache at /Users/forrest/.cache/modelscope/hub/datasets/downloads/7162681f0376bc615bdfcb777bd3306e4a622398e48c5cd637d44cf1a662d854
2025-12-28 14:36:09,659 - modelscope - INFO - creating metadata file for /Users/forrest/.cache/modelscope/hub/datasets/downloads/7162681f0376bc615bdfcb777bd3306e4a622398e48c5cd637d44cf1a662d854
Generating train split: 0 examples [00:00, ? examples/s]
Generating test split: 0 examples [00:00, ? examples/s]
In [4]:
# Display the dataset samples
dataset.show(head=1)
Dataset: train
{0: {'name': 'ENSRNA049481990_Sb',
'sequence': 'CGTTTGGGTATGGACATTTAGACTTGTCGTGTTCCTGATGCCTCCCATTCCTATGGTTCTTAGGTGCTCCTTCCTCTTCCTTTCGCTAGCGCAATTGATTTAGTGATGAACACAATATACATTCCAAAGCACATAGTTAGATGAGAGCCTGATGGCAATTGGCAAGTCAG',
'labels': -0.2166846610255508}}
Dataset: test
{0: {'name': 'AT5G03425_At',
'sequence': 'TGAGTGAAGGCAGAATTGACCCATGCAGCTTCCTTTCTTTCACCACTCACTTGCTAGGAAACTACAAAAATAGAAAAAGAAAACTCACGGCAACCAAAAACGCGAACTCCTAGAGGGTTTCGAACACTTTGAAATTTGTATCAGACATCAAATGAAATCTTTAACTTCTT',
'labels': -0.5374512291279694}}
In [5]:
# Display dataset statistics
dataset.statistics()
Out[5]:
{'train': {'data_type': 'regression',
'n_samples': 58179,
'min_len': 170,
'max_len': 170,
'mean_len': 170.0,
'median_len': 170.0},
'test': {'data_type': 'regression',
'n_samples': 7154,
'min_len': 170,
'max_len': 170,
'mean_len': 170.0,
'median_len': 170.0}}
In [6]:
# Visualize dataset statistics
dataset.plot_statistics()
Successfully plotted dataset statistics.
In [7]:
## Pre-processing of dataset
from dnallm import load_config, load_model_and_tokenizer

# Load configuration
configs = load_config("finetune_config.yaml")

# Load model and tokenizer
model, tokenizer = load_model_and_tokenizer(
    "zhangtaolab/plant-dnabert-BPE",
    task_config=configs["task"],
    source="modelscope"
)

# Tokenize the dataset
dataset.encode_sequences(tokenizer=tokenizer)
Downloading Model from https://www.modelscope.cn to directory: /Users/forrest/.cache/modelscope/hub/models/zhangtaolab/plant-dnabert-BPE
14:36:12 - dnallm.utils.support - INFO - Model files are stored in /Users/forrest/.cache/modelscope/hub/models/zhangtaolab/plant-dnabert-BPE
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /Users/forrest/.cache/modelscope/hub/models/zhangtaolab/plant-dnabert-BPE and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Encoding inputs: 0%| | 0/58179 [00:00<?, ? examples/s]
Encoding inputs: 0%| | 0/7154 [00:00<?, ? examples/s]
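After encoding, each record should carry the tokenizer's output fields alongside the raw sequence; the exact field names (e.g., input_ids, attention_mask) depend on the tokenizer and are an assumption here. A quick sanity check reuses the helper from earlier:

# Re-inspect a sample to confirm the encoded fields were added
dataset.show(head=1)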
In [8]:
## Start fine-tuning
# from dnallm.finetune import DNATrainer
# trainer = DNATrainer(
#     config=configs,
#     model=model,
#     datasets=dataset
# )
# trainer.train()
Load datasets from Hugging Face or ModelScope
In [9]:
# Load tokenizer for demonstration
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("zhangtaolab/plant-dnabert-BPE")
In [10]:
# Load dataset from Hugging Face
dataset = DNADataset.from_huggingface(
    "zhangtaolab/plant-multi-species-core-promoters",
    seq_col="sequence",
    label_col="label",
    tokenizer=tokenizer,
    max_length=512
)
In [11]:
# Load dataset from ModelScope
dataset = DNADataset.from_modelscope(
    "zhangtaolab/plant-multi-species-core-promoters",
    seq_col="sequence",
    label_col="label",
    tokenizer=tokenizer,
    max_length=512
)
Load datasets from local paths
In [12]:
# Load single dataset
dataset = DNADataset.load_local_data(
    "../../../../tests/test_data/regression/train.csv",
    seq_col="sequence",
    label_col="label",
    tokenizer=tokenizer,
    max_length=512
)
Generating train split: 0 examples [00:00, ? examples/s]
Format labels: 0%| | 0/500 [00:00<?, ? examples/s]
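The object returned by load_local_data is the same kind of dataset as in the preset example, so the inspection helpers shown earlier apply unchanged; for example:

# Inspect the locally loaded dataset as before
dataset.show(head=1)
dataset.statistics()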
In [13]:
# Load multiple files (e.g., pre-split datasets)
dataset = DNADataset.load_local_data(
    {
        "train": "../../../../tests/test_data/regression/train.csv",
        "test": "../../../../tests/test_data/regression/test.csv",
        "validation": "../../../../tests/test_data/regression/dev.csv"
    },
    seq_col="sequence",
    label_col="label",
    tokenizer=tokenizer,
    max_length=512
)
Generating train split: 0 examples [00:00, ? examples/s]
Format labels: 0%| | 0/100 [00:00<?, ? examples/s]
Generating train split: 0 examples [00:00, ? examples/s]
Format labels: 0%| | 0/100 [00:00<?, ? examples/s]
Manually prepare dataset for fine-tuning
In [14]:
## Binary classification example
# For binary classification, labels should be 0 and 1
# Data format: two columns, "sequence" and "label"
'''
sequence,label
ATCGATCGATCG,1
GCTAGCTAGCTA,0
TATATATATATA,1
'''
Out[14]:
'\nsequence,label\nATCGATCGATCG,1\nGCTAGCTAGCTA,0\nTATATATATATA,1\n'
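To try this format end to end, the rows can be written to a CSV file and loaded with load_local_data using the same arguments shown earlier. A minimal sketch; the file name data_binary.csv and the pandas dependency are illustrative assumptions:

# Write a tiny binary-classification CSV and load it
import pandas as pd

df = pd.DataFrame({
    "sequence": ["ATCGATCGATCG", "GCTAGCTAGCTA", "TATATATATATA"],
    "label": [1, 0, 1],  # binary labels: 0 or 1
})
df.to_csv("data_binary.csv", index=False)

dataset = DNADataset.load_local_data(
    "data_binary.csv",
    seq_col="sequence",
    label_col="label",
    tokenizer=tokenizer,
    max_length=512
)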
In [15]:
## Multi-class classification example
# For multi-class classification, labels should be integers starting from 0
# Data format: two columns, "sequence" and "label"
'''
sequence,label
ATCGATCGATCG,0
GCTAGCTAGCTA,1
TATATATATATA,2
CGCGCGCGCGCG,3
'''
Out[15]:
'\nsequence,label\nATCGATCGATCG,0\nGCTAGCTAGCTA,1\nTATATATATATA,2\nCGCGCGCGCGCG,3\n'
In [16]:
## Multi-label classification example
# For multi-label classification, labels should be a list of integers separated by a specific delimiter (e.g., ";")
# Data format: two columns, "sequence" and "label"
'''
sequence,label
ATCGATCGATCG,1;0;1;0;0
GCTAGCTAGCTA,0;1;0;1;0
TATATATATATA,1;1;0;0;1
'''
Out[16]:
'\nsequence,label\nATCGATCGATCG,1;0;1;0;0\nGCTAGCTAGCTA,0;1;0;1;0\nTATATATATATA,1;1;0;0;1\n'
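Note that the preset configurations above declare multi_separator ';' for exactly this layout. As a plain-Python illustration (not DNALLM internals), each label field splits on the delimiter into one indicator per class:

# How a ';'-separated multi-label field maps to a per-class vector
label_field = "1;0;1;0;0"
label_vector = [int(x) for x in label_field.split(";")]
print(label_vector)  # [1, 0, 1, 0, 0] -> sample belongs to classes 0 and 2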
In [17]:
## Regression example
# For regression, labels should be float numbers
# Data format: two columns, "sequence" and "label"
'''
sequence,label
ATCGATCGATCG,0.85
GCTAGCTAGCTA,0.23
TATATATATATA,0.67
'''
Out[17]:
'\nsequence,label\nATCGATCGATCG,0.85\nGCTAGCTAGCTA,0.23\nTATATATATATA,0.67\n'
In [18]:
## Token classification example / Named Entity Recognition (NER)
# For token classification, please refer to example/notebooks/finetune_NER_task/data_generation_and_inference.ipynb
In [19]:
## Masked language modeling (MLM) example
# For MLM, only a single "sequence" column is needed (no labels), since this is a self-supervised task.
# Sequences can be extracted from FASTA files of genomic data.
'''
sequence
ATCGATCGATCG
GCTAGCTAGCTA
TATATATATATA
'''
Out[19]:
'\nsequence\nATCGATCGATCG\nGCTAGCTAGCTA\nTATATATATATA\n'
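Since the comment above points to FASTA files as the sequence source, here is a dependency-free sketch that slices a FASTA genome into fixed-length windows and writes the one-column CSV used for MLM. The file names, window size, and step are illustrative assumptions:

# Slice a FASTA file into fixed-length windows and save a one-column "sequence" CSV
def fasta_to_mlm_csv(fasta_path, csv_path, window=170, step=170):
    # Collect one concatenated sequence per FASTA record
    seqs, current = [], []
    with open(fasta_path) as fh:
        for line in fh:
            line = line.strip()
            if line.startswith(">"):
                if current:
                    seqs.append("".join(current))
                current = []
            elif line:
                current.append(line.upper())
    if current:
        seqs.append("".join(current))
    # Write one window per row under a single "sequence" header
    with open(csv_path, "w") as out:
        out.write("sequence\n")
        for seq in seqs:
            for i in range(0, len(seq) - window + 1, step):
                out.write(seq[i:i + window] + "\n")

# fasta_to_mlm_csv("genome.fa", "mlm_data.csv")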
In [20]:
## Generation example / Causal Language Modeling (CLM)
# For plain CLM, a single "sequence" column is sufficient, as shown below.
# Sequence-to-sequence generation tasks instead use two columns, "input_sequence" and "output_sequence".
# Sequences can be extracted from FASTA files of genomic data.
'''
sequence
ATCGATCGATCG
GCTAGCTAGCTA
TATATATATATA
'''
Out[20]:
'\nsequence\nATCGATCGATCG\nGCTAGCTAGCTA\nTATATATATATA\n'
In [21]:
## These sequences and labels can be saved as a CSV file for loading.
# dataset = DNADataset.load_local_data(
#     "data.csv",
#     seq_col="sequence",
#     label_col="label",
#     tokenizer=tokenizer,
#     max_length=512
# )