LoRA Fine-tuning
In [1]:
Copied!
# Route all Hugging Face Hub downloads through the hf-mirror endpoint
# (needed on networks where huggingface.co is unreachable).
# NOTE: HF_ENDPOINT must be set BEFORE huggingface_hub / transformers are
# imported, which is why this is the very first cell.
import os
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
In [2]:
Copied!
from dnallm import load_config, load_model_and_tokenizer, DNADataset, DNATrainer
from dnallm import load_config, load_model_and_tokenizer, DNADataset, DNATrainer
In [3]:
Copied!
# Load the fine-tuning configuration from YAML.
# `configs` is expected to hold at least a 'task' section (passed to
# load_model_and_tokenizer below) and the LoRA settings that DNATrainer
# reads when use_lora=True.
configs = load_config("./finetune_config.yaml")
# Load the fine-tuning configuration from YAML.
configs = load_config("./finetune_config.yaml")
In [4]:
Copied!
# Load the model and tokenizer for the task defined in the config.
model_name = "kuleshov-group/PlantCAD2-Small-l24-d0768"
# from Hugging Face (source="huggingface"; downloads go through the
# HF_ENDPOINT mirror set in the first cell)
model, tokenizer = load_model_and_tokenizer(model_name, task_config=configs['task'], source="huggingface")
# Load the model and tokenizer.
model_name = "kuleshov-group/PlantCAD2-Small-l24-d0768"
# from Hugging Face (source="huggingface")
model, tokenizer = load_model_and_tokenizer(model_name, task_config=configs['task'], source="huggingface")
Fetching 10 files: 0%| | 0/10 [00:00<?, ?it/s]
15:31:42 - dnallm.utils.support - INFO - Model files are stored in /home/liuguanqing/.cache/huggingface/hub/models--kuleshov-group--PlantCAD2-Small-l24-d0768/snapshots/f756c255cb76e9f538c3acec04acf4214ed03fb3
Some weights of the model checkpoint at /home/liuguanqing/.cache/huggingface/hub/models--kuleshov-group--PlantCAD2-Small-l24-d0768/snapshots/f756c255cb76e9f538c3acec04acf4214ed03fb3 were not used when initializing CaduceusForSequenceClassification: ['lm_head.complement_map', 'lm_head.lm_head.weight'] - This IS expected if you are initializing CaduceusForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing CaduceusForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). Some weights of CaduceusForSequenceClassification were not initialized from the model checkpoint at /home/liuguanqing/.cache/huggingface/hub/models--kuleshov-group--PlantCAD2-Small-l24-d0768/snapshots/f756c255cb76e9f538c3acec04acf4214ed03fb3 and are newly initialized: ['score.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
In [5]:
Copied!
# Load the datasets (train/validation/test splits, per the encoding
# progress bars in the output below).
data_name = "zhangtaolab/plant-multi-species-core-promoters"
# from ModelScope (dataset id on the ModelScope hub)
datasets = DNADataset.from_modelscope(data_name, seq_col="sequence", label_col="label", tokenizer=tokenizer, max_length=512)
# Downsample to 5% of the data for a quick demo run;
# overwrite=True replaces the splits held by the dataset object in place.
sampled_datasets = datasets.sampling(0.05, overwrite=True)
# Tokenize the DNA sequences (max_length=512 set above);
# remove_unused_columns=True drops the raw columns after encoding.
sampled_datasets.encode_sequences(remove_unused_columns=True)
# Load the datasets.
data_name = "zhangtaolab/plant-multi-species-core-promoters"
# from ModelScope
datasets = DNADataset.from_modelscope(data_name, seq_col="sequence", label_col="label", tokenizer=tokenizer, max_length=512)
# Downsample the datasets to 5%.
sampled_datasets = datasets.sampling(0.05, overwrite=True)
# Encode (tokenize) the datasets.
sampled_datasets.encode_sequences(remove_unused_columns=True)
2026-03-24 15:31:44,151 - modelscope - INFO - Not logged-in, you can login for uploadingor accessing controlled entities.
Encoding inputs: 0%| | 0/3328 [00:00<?, ? examples/s]
Encoding inputs: 0%| | 0/416 [00:00<?, ? examples/s]
Encoding inputs: 0%| | 0/416 [00:00<?, ? examples/s]
In [6]:
Copied!
# Initialize the trainer with LoRA enabled: only the adapter weights are
# trained (~2.4M of ~178M params per the log output below).
trainer = DNATrainer(
model=model,
config=configs,
datasets=sampled_datasets,
use_lora=True # apply LoRA using the settings from `configs`
)
# Initialize the trainer with LoRA enabled.
trainer = DNATrainer(
model=model,
config=configs,
datasets=sampled_datasets,
use_lora=True # apply LoRA using the settings from `configs`
)
[Info] Applying LoRA to the model... trainable params: 2,419,200 || all params: 178,395,264 || trainable%: 1.3561
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
In [7]:
Copied!
# Run fine-tuning; train() returns a metrics dict
# (runtime, samples/sec, final train loss, epoch — see output below).
metrics = trainer.train()
print(metrics)
# Run fine-tuning and print the returned metrics.
metrics = trainer.train()
print(metrics)
[416/416 02:43, Epoch 2/2]
| Step | Training Loss | Validation Loss | Accuracy | Precision | Recall | F1 | Mcc | Auroc | Auprc | Tpr | Tnr | Fpr | Fnr |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 100 | 0.691100 | 0.690458 | 0.612981 | 0.576512 | 0.794118 | 0.668041 | 0.248565 | 0.660088 | 0.653642 | 0.794118 | 0.438679 | 0.561321 | 0.205882 |
| 200 | 0.683700 | 0.677969 | 0.588942 | 0.545455 | 0.970588 | 0.698413 | 0.288297 | 0.769908 | 0.765890 | 0.970588 | 0.221698 | 0.778302 | 0.029412 |
| 300 | 0.652300 | 0.652287 | 0.649038 | 0.590062 | 0.931373 | 0.722433 | 0.369038 | 0.789389 | 0.787482 | 0.931373 | 0.377358 | 0.622642 | 0.068627 |
| 400 | 0.629800 | 0.641151 | 0.675481 | 0.621053 | 0.867647 | 0.723926 | 0.385537 | 0.791782 | 0.799368 | 0.867647 | 0.490566 | 0.509434 | 0.132353 |
{'train_runtime': 182.509, 'train_samples_per_second': 36.469, 'train_steps_per_second': 2.279, 'total_flos': 3647563722915840.0, 'train_loss': 0.6615940974308894, 'epoch': 2.0}
In [ ]:
Copied!