LoRA Inference
In [1]:
Copied!
from dnallm import load_config, load_model_and_tokenizer
from dnallm import DNAInference
from dnallm import load_config, load_model_and_tokenizer
from dnallm import DNAInference
In [2]:
Copied!
# Load the inference configuration from YAML; `configs['task']` is passed to
# the model loader below and the full dict to DNAInference.
# NOTE(review): each statement appears twice throughout this page — an artifact
# of the docs site's "Copied!" widget duplicating cell code; only one ran.
# Load the config file
configs = load_config("inference_config.yaml")
# Load the config file
configs = load_config("inference_config.yaml")
In [ ]:
Copied!
# Load the base PlantCAD2 model and its tokenizer for the task described in
# `configs['task']`. Two mirrors are offered: Hugging Face (active line) and
# ModelScope (commented out).
# NOTE(review): the captured log below shows a download from www.modelscope.cn
# under the `lgq12697/` namespace, not `kuleshov-group/` from Hugging Face —
# the executed cell evidently used the ModelScope variant; confirm which
# source/namespace readers should actually use.
# Load the model and tokenizer
model_name = "kuleshov-group/PlantCAD2-Small-l24-d0768"
# from Hugging Face
model, tokenizer = load_model_and_tokenizer(model_name, task_config=configs['task'], source="huggingface")
# from ModelScope
# model, tokenizer = load_model_and_tokenizer(model_name, task_config=configs['task'], source="modelscope")
# Load the model and tokenizer
model_name = "kuleshov-group/PlantCAD2-Small-l24-d0768"
# from Hugging Face
model, tokenizer = load_model_and_tokenizer(model_name, task_config=configs['task'], source="huggingface")
# from ModelScope
# model, tokenizer = load_model_and_tokenizer(model_name, task_config=configs['task'], source="modelscope")
Downloading Model from https://www.modelscope.cn to directory: /home/liuguanqing/.cache/modelscope/hub/models/lgq12697/PlantCAD2-Small-l24-d0768 16:51:20 - dnallm.models.model - INFO - Model files are stored in /home/liuguanqing/.cache/modelscope/hub/models/lgq12697/PlantCAD2-Small-l24-d0768 Downloading Model from https://www.modelscope.cn to directory: /home/liuguanqing/.cache/modelscope/hub/models/lgq12697/PlantCAD2-Small-l24-d0768 Downloading Model from https://www.modelscope.cn to directory: /home/liuguanqing/.cache/modelscope/hub/models/lgq12697/PlantCAD2-Small-l24-d0768
Some weights of the model checkpoint at /home/liuguanqing/.cache/modelscope/hub/models/lgq12697/PlantCAD2-Small-l24-d0768 were not used when initializing CaduceusForSequenceClassification: ['lm_head.complement_map', 'lm_head.lm_head.weight'] - This IS expected if you are initializing CaduceusForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing CaduceusForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). Some weights of CaduceusForSequenceClassification were not initialized from the model checkpoint at /home/liuguanqing/.cache/modelscope/hub/models/lgq12697/PlantCAD2-Small-l24-d0768 and are newly initialized: ['score.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
In [ ]:
Copied!
# Build the DNAInference engine on top of the base model, attaching a LoRA
# adapter fine-tuned for cross-species ACR classification (trained on
# Arabidopsis). The log below confirms the adapter weights
# (adapter_model.safetensors) are fetched and loaded, and that CUDA is used.
# NOTE(review): the adapter path here uses the `plantcad/` namespace while the
# captured download log shows `lgq12697/...` from ModelScope — presumably
# mirrors of the same adapter; verify the path matches the intended hub.
# Create inference engine
lora_adapter_path = "plantcad/cross_species_acr_train_on_arabidopsis_plantcad2_small"
inference_engine = DNAInference(
model=model,
tokenizer=tokenizer,
config=configs,
lora_adapter=lora_adapter_path
)
# Create inference engine
lora_adapter_path = "plantcad/cross_species_acr_train_on_arabidopsis_plantcad2_small"
inference_engine = DNAInference(
model=model,
tokenizer=tokenizer,
config=configs,
lora_adapter=lora_adapter_path
)
Downloading Model from https://www.modelscope.cn to directory: /home/liuguanqing/.cache/modelscope/hub/models/lgq12697/cross_species_acr_train_on_arabidopsis_plantcad2_small
2025-09-19 16:51:24,567 - modelscope - INFO - Got 8 files, start to download ...
Processing 8 items: 0%| | 0.00/8.00 [00:00<?, ?it/s]
Downloading [adapter_config.json]: 0%| | 0.00/781 [00:00<?, ?B/s]
Downloading [optimizer.pt]: 0%| | 0.00/11.8M [00:00<?, ?B/s]
Downloading [rng_state.pth]: 0%| | 0.00/13.9k [00:00<?, ?B/s]
Downloading [adapter_model.safetensors]: 0%| | 0.00/9.26M [00:00<?, ?B/s]
Downloading [README.md]: 0%| | 0.00/5.02k [00:00<?, ?B/s]
Downloading [trainer_state.json]: 0%| | 0.00/11.4k [00:00<?, ?B/s]
Downloading [scheduler.pt]: 0%| | 0.00/1.04k [00:00<?, ?B/s]
Downloading [training_args.bin]: 0%| | 0.00/5.43k [00:00<?, ?B/s]
2025-09-19 16:51:29,935 - modelscope - INFO - Download model 'lgq12697/cross_species_acr_train_on_arabidopsis_plantcad2_small' successfully.
16:51:29 - dnallm.models.model - INFO - Model files are stored in /home/liuguanqing/.cache/modelscope/hub/models/lgq12697/cross_species_acr_train_on_arabidopsis_plantcad2_small 16:51:30 - dnallm.models.model - INFO - Loaded LoRA adapter from lgq12697/cross_species_acr_train_on_arabidopsis_plantcad2_small 16:51:30 - dnallm.models.model - INFO - Using device: cuda
In [5]:
Copied!
# Run inference on a single in-memory sequence (one 600-nt DNA string, built
# from implicitly concatenated adjacent string literals). `infer_seqs` takes a
# list of sequences; the printed result below classifies this one as
# 'positive' with per-class probability scores.
seqs = [(
"AAAAATTTAAATATCGTCTGTAGATATTTTATGGGATGCTTTGAGAATGGGCTTCGTTTTAATGGGCCTC"
"CTCTGCAATCATTGTCCAGAGTCGAGAAACCACCTCTTCTTCTCTTGTTCTTTCTCCAAATCGATTTGGT"
"CCCAACTCTCTTCAAGCAAAGGAGAGATATGAAAATGAAAGCTCTTACGGCGAACAAGTTTTTCCGATTG"
"AAGAAGAGAAGAATCTAGAAGATGAAGACAACACTAGTGCACCAAACAGTTTTGCGCGTCTTGAGAGGAA"
"ACAAAAAACTATTCAGAGTTCAGAGAGAGTCAACCCCCAAACGAGACTTAAACGATGAGCCCACTATAAT"
"TTTATAATTTATGGGCCATCAGGCCCAAATGATCAGTAGTAGTTATTATTTGACTTTTGACATGGTGGAT"
"TTGGTTTAACCACCAAACCGAACGAGTAAAACACTATTGGATTGGGTGATGATATCCCGGTTTTATTTGG"
"TTAAAATCACAAAATCCTGATTTTGGTTCGCGGCTTGATTCTGCCGCTCTCTCGTCTTTAACCTAACTAA"
"AGACGTAGAATGATTCTGGTTATTGAATTAGTTTGATACA"
)]
results = inference_engine.infer_seqs(seqs)
seqs = [(
"AAAAATTTAAATATCGTCTGTAGATATTTTATGGGATGCTTTGAGAATGGGCTTCGTTTTAATGGGCCTC"
"CTCTGCAATCATTGTCCAGAGTCGAGAAACCACCTCTTCTTCTCTTGTTCTTTCTCCAAATCGATTTGGT"
"CCCAACTCTCTTCAAGCAAAGGAGAGATATGAAAATGAAAGCTCTTACGGCGAACAAGTTTTTCCGATTG"
"AAGAAGAGAAGAATCTAGAAGATGAAGACAACACTAGTGCACCAAACAGTTTTGCGCGTCTTGAGAGGAA"
"ACAAAAAACTATTCAGAGTTCAGAGAGAGTCAACCCCCAAACGAGACTTAAACGATGAGCCCACTATAAT"
"TTTATAATTTATGGGCCATCAGGCCCAAATGATCAGTAGTAGTTATTATTTGACTTTTGACATGGTGGAT"
"TTGGTTTAACCACCAAACCGAACGAGTAAAACACTATTGGATTGGGTGATGATATCCCGGTTTTATTTGG"
"TTAAAATCACAAAATCCTGATTTTGGTTCGCGGCTTGATTCTGCCGCTCTCTCGTCTTTAACCTAACTAA"
"AGACGTAGAATGATTCTGGTTATTGAATTAGTTTGATACA"
)]
results = inference_engine.infer_seqs(seqs)
Encoding inputs: 0%| | 0/1 [00:00<?, ? examples/s]
Inferring: 100%|██████████| 1/1 [00:06<00:00, 6.34s/it]
In [6]:
Copied!
# Display the raw results: a dict keyed by sequence index, each value holding
# the input 'sequence', predicted 'label', and per-class probability 'scores'
# (see the printed output below).
print(results)
print(results)
{0: {'sequence': 'AAAAATTTAAATATCGTCTGTAGATATTTTATGGGATGCTTTGAGAATGGGCTTCGTTTTAATGGGCCTCCTCTGCAATCATTGTCCAGAGTCGAGAAACCACCTCTTCTTCTCTTGTTCTTTCTCCAAATCGATTTGGTCCCAACTCTCTTCAAGCAAAGGAGAGATATGAAAATGAAAGCTCTTACGGCGAACAAGTTTTTCCGATTGAAGAAGAGAAGAATCTAGAAGATGAAGACAACACTAGTGCACCAAACAGTTTTGCGCGTCTTGAGAGGAAACAAAAAACTATTCAGAGTTCAGAGAGAGTCAACCCCCAAACGAGACTTAAACGATGAGCCCACTATAATTTTATAATTTATGGGCCATCAGGCCCAAATGATCAGTAGTAGTTATTATTTGACTTTTGACATGGTGGATTTGGTTTAACCACCAAACCGAACGAGTAAAACACTATTGGATTGGGTGATGATATCCCGGTTTTATTTGGTTAAAATCACAAAATCCTGATTTTGGTTCGCGGCTTGATTCTGCCGCTCTCTCGTCTTTAACCTAACTAAAGACGTAGAATGATTCTGGTTATTGAATTAGTTTGATACA', 'label': 'positive', 'scores': {'negative': 0.37261253595352173, 'positive': 0.6273874640464783}}}
In [10]:
Copied!
# Batch inference from a CSV file with 250 labeled sequences (per the progress
# bars below). `seq_col`/`label_col` name the CSV columns; `evaluate=True`
# additionally returns a metrics dict computed against the provided labels.
infer_file = "./test.csv"
results, metrics = inference_engine.infer_file(
infer_file, seq_col="sequence", label_col="label", evaluate=True
)
infer_file = "./test.csv"
results, metrics = inference_engine.infer_file(
infer_file, seq_col="sequence", label_col="label", evaluate=True
)
Generating train split: 0 examples [00:00, ? examples/s]
Format labels: 0%| | 0/250 [00:00<?, ? examples/s]
Encoding inputs: 0%| | 0/250 [00:00<?, ? examples/s]
Inferring: 100%|██████████| 16/16 [00:14<00:00, 1.13it/s]
In [11]:
Copied!
# Peek at the first prediction only (the `break` stops after one record), then
# print the evaluation metrics. The output below shows accuracy/precision/
# recall/F1/MCC/AUROC/AUPRC plus TPR/TNR/FPR/FNR for the 250-sequence test set.
for i, res in results.items():
print(res['label'], res['scores'], sep="\n")
break
print(metrics)
for i, res in results.items():
print(res['label'], res['scores'], sep="\n")
break
print(metrics)
negative
{'negative': 0.9992390871047974, 'positive': 0.0007609084714204073}
{'accuracy': 0.776, 'precision': 1.0, 'recall': 0.06666666666666667, 'f1': 0.125, 'mcc': 0.2269152152350059, 'AUROC': 0.8207017543859649, 'AUPRC': 0.6004600585479547, 'TPR': 0.06666666666666667, 'TNR': 1.0, 'FPR': 0.0, 'FNR': 0.9333333333333333}
In [ ]:
Copied!