LoRA Inference
In [2]:
Copied!
# DNALLM helpers used in this notebook:
# - load_config: reads the YAML inference configuration
# - load_model_and_tokenizer: downloads/initializes the base model + tokenizer
# - DNAInference: the inference engine (accepts an optional LoRA adapter)
from dnallm import load_config, load_model_and_tokenizer
from dnallm import DNAInference
# DNALLM helpers used in this notebook:
# - load_config: reads the YAML inference configuration
# - load_model_and_tokenizer: downloads/initializes the base model + tokenizer
# - DNAInference: the inference engine (accepts an optional LoRA adapter)
from dnallm import load_config, load_model_and_tokenizer
from dnallm import DNAInference
In [3]:
Copied!
# Load the config file
# The returned mapping carries the task settings (configs['task']) that are
# passed to load_model_and_tokenizer in the next cell.
configs = load_config("inference_config.yaml")
# Load the config file
# The returned mapping carries the task settings (configs['task']) that are
# passed to load_model_and_tokenizer in the next cell.
configs = load_config("inference_config.yaml")
In [4]:
Copied!
# Load the model and tokenizer
# NOTE: as the warning below shows, the sequence-classification head
# ('score.weight') is newly initialized — the base checkpoint alone is not
# usable for prediction; the fine-tuned LoRA adapter attached in the next
# cell supplies the trained task weights.
model_name = "kuleshov-group/PlantCAD2-Small-l24-d0768"
# from Hugging Face
model, tokenizer = load_model_and_tokenizer(model_name, task_config=configs['task'], source="huggingface")
# from ModelScope
# model, tokenizer = load_model_and_tokenizer(model_name, task_config=configs['task'], source="modelscope")
# Load the model and tokenizer
# NOTE: as the warning below shows, the sequence-classification head
# ('score.weight') is newly initialized — the base checkpoint alone is not
# usable for prediction; the fine-tuned LoRA adapter attached in the next
# cell supplies the trained task weights.
model_name = "kuleshov-group/PlantCAD2-Small-l24-d0768"
# from Hugging Face
model, tokenizer = load_model_and_tokenizer(model_name, task_config=configs['task'], source="huggingface")
# from ModelScope
# model, tokenizer = load_model_and_tokenizer(model_name, task_config=configs['task'], source="modelscope")
Fetching 10 files: 0%| | 0/10 [00:00<?, ?it/s]
15:24:53 - dnallm.utils.support - INFO - Model files are stored in /home/liuguanqing/.cache/huggingface/hub/models--kuleshov-group--PlantCAD2-Small-l24-d0768/snapshots/f756c255cb76e9f538c3acec04acf4214ed03fb3
Some weights of the model checkpoint at /home/liuguanqing/.cache/huggingface/hub/models--kuleshov-group--PlantCAD2-Small-l24-d0768/snapshots/f756c255cb76e9f538c3acec04acf4214ed03fb3 were not used when initializing CaduceusForSequenceClassification: ['lm_head.complement_map', 'lm_head.lm_head.weight'] - This IS expected if you are initializing CaduceusForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing CaduceusForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model). Some weights of CaduceusForSequenceClassification were not initialized from the model checkpoint at /home/liuguanqing/.cache/huggingface/hub/models--kuleshov-group--PlantCAD2-Small-l24-d0768/snapshots/f756c255cb76e9f538c3acec04acf4214ed03fb3 and are newly initialized: ['score.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
In [5]:
Copied!
# Create inference engine
# The adapter path is a Hugging Face Hub repo id; its files are fetched and
# the LoRA weights are merged onto the base model (see the log below).
lora_adapter_path = "plantcad/cross_species_acr_train_on_arabidopsis_plantcad2_small"
inference_engine = DNAInference(
model=model,
tokenizer=tokenizer,
config=configs,
lora_adapter=lora_adapter_path
)
# Create inference engine
# The adapter path is a Hugging Face Hub repo id; its files are fetched and
# the LoRA weights are merged onto the base model (see the log below).
lora_adapter_path = "plantcad/cross_species_acr_train_on_arabidopsis_plantcad2_small"
inference_engine = DNAInference(
model=model,
tokenizer=tokenizer,
config=configs,
lora_adapter=lora_adapter_path
)
Fetching 9 files: 0%| | 0/9 [00:00<?, ?it/s]
15:25:33 - dnallm.utils.support - INFO - Model files are stored in /home/liuguanqing/.cache/huggingface/hub/models--plantcad--cross_species_acr_train_on_arabidopsis_plantcad2_small/snapshots/a76b001f89a3be2e12990158bb1903ccddd17625 15:25:46 - dnallm.utils.support - INFO - Loaded LoRA adapter from plantcad/cross_species_acr_train_on_arabidopsis_plantcad2_small 15:25:46 - dnallm.utils.support - INFO - Using device: cuda
In [6]:
Copied!
# Single-sequence inference.
# The adjacent string literals are concatenated into one DNA sequence, so
# seqs is a list containing a single string; infer_seqs returns a dict keyed
# by input index (see the printed result in the next cell).
seqs = [(
"AAAAATTTAAATATCGTCTGTAGATATTTTATGGGATGCTTTGAGAATGGGCTTCGTTTTAATGGGCCTC"
"CTCTGCAATCATTGTCCAGAGTCGAGAAACCACCTCTTCTTCTCTTGTTCTTTCTCCAAATCGATTTGGT"
"CCCAACTCTCTTCAAGCAAAGGAGAGATATGAAAATGAAAGCTCTTACGGCGAACAAGTTTTTCCGATTG"
"AAGAAGAGAAGAATCTAGAAGATGAAGACAACACTAGTGCACCAAACAGTTTTGCGCGTCTTGAGAGGAA"
"ACAAAAAACTATTCAGAGTTCAGAGAGAGTCAACCCCCAAACGAGACTTAAACGATGAGCCCACTATAAT"
"TTTATAATTTATGGGCCATCAGGCCCAAATGATCAGTAGTAGTTATTATTTGACTTTTGACATGGTGGAT"
"TTGGTTTAACCACCAAACCGAACGAGTAAAACACTATTGGATTGGGTGATGATATCCCGGTTTTATTTGG"
"TTAAAATCACAAAATCCTGATTTTGGTTCGCGGCTTGATTCTGCCGCTCTCTCGTCTTTAACCTAACTAA"
"AGACGTAGAATGATTCTGGTTATTGAATTAGTTTGATACA"
)]
results = inference_engine.infer_seqs(seqs)
# Single-sequence inference.
# The adjacent string literals are concatenated into one DNA sequence, so
# seqs is a list containing a single string; infer_seqs returns a dict keyed
# by input index (see the printed result in the next cell).
seqs = [(
"AAAAATTTAAATATCGTCTGTAGATATTTTATGGGATGCTTTGAGAATGGGCTTCGTTTTAATGGGCCTC"
"CTCTGCAATCATTGTCCAGAGTCGAGAAACCACCTCTTCTTCTCTTGTTCTTTCTCCAAATCGATTTGGT"
"CCCAACTCTCTTCAAGCAAAGGAGAGATATGAAAATGAAAGCTCTTACGGCGAACAAGTTTTTCCGATTG"
"AAGAAGAGAAGAATCTAGAAGATGAAGACAACACTAGTGCACCAAACAGTTTTGCGCGTCTTGAGAGGAA"
"ACAAAAAACTATTCAGAGTTCAGAGAGAGTCAACCCCCAAACGAGACTTAAACGATGAGCCCACTATAAT"
"TTTATAATTTATGGGCCATCAGGCCCAAATGATCAGTAGTAGTTATTATTTGACTTTTGACATGGTGGAT"
"TTGGTTTAACCACCAAACCGAACGAGTAAAACACTATTGGATTGGGTGATGATATCCCGGTTTTATTTGG"
"TTAAAATCACAAAATCCTGATTTTGGTTCGCGGCTTGATTCTGCCGCTCTCTCGTCTTTAACCTAACTAA"
"AGACGTAGAATGATTCTGGTTATTGAATTAGTTTGATACA"
)]
results = inference_engine.infer_seqs(seqs)
Encoding inputs: 0%| | 0/1 [00:00<?, ? examples/s]
Inferring: 100%|██████████| 1/1 [00:33<00:00, 33.73s/it]
In [7]:
Copied!
# Each entry maps input index -> {'sequence', predicted 'label',
# per-class probability 'scores'} (see the output below).
print(results)
# Each entry maps input index -> {'sequence', predicted 'label',
# per-class probability 'scores'} (see the output below).
print(results)
{0: {'sequence': 'AAAAATTTAAATATCGTCTGTAGATATTTTATGGGATGCTTTGAGAATGGGCTTCGTTTTAATGGGCCTCCTCTGCAATCATTGTCCAGAGTCGAGAAACCACCTCTTCTTCTCTTGTTCTTTCTCCAAATCGATTTGGTCCCAACTCTCTTCAAGCAAAGGAGAGATATGAAAATGAAAGCTCTTACGGCGAACAAGTTTTTCCGATTGAAGAAGAGAAGAATCTAGAAGATGAAGACAACACTAGTGCACCAAACAGTTTTGCGCGTCTTGAGAGGAAACAAAAAACTATTCAGAGTTCAGAGAGAGTCAACCCCCAAACGAGACTTAAACGATGAGCCCACTATAATTTTATAATTTATGGGCCATCAGGCCCAAATGATCAGTAGTAGTTATTATTTGACTTTTGACATGGTGGATTTGGTTTAACCACCAAACCGAACGAGTAAAACACTATTGGATTGGGTGATGATATCCCGGTTTTATTTGGTTAAAATCACAAAATCCTGATTTTGGTTCGCGGCTTGATTCTGCCGCTCTCTCGTCTTTAACCTAACTAAAGACGTAGAATGATTCTGGTTATTGAATTAGTTTGATACA', 'label': 'positive', 'scores': {'negative': 0.06456661224365234, 'positive': 0.9354333281517029}}}
In [8]:
Copied!
# Batch inference from a CSV file.
# seq_col/label_col name the input columns; with evaluate=True the ground-truth
# labels are also used to compute evaluation metrics, returned alongside the
# per-sequence results.
infer_file = "./test.csv"
results, metrics = inference_engine.infer_file(
infer_file, seq_col="sequence", label_col="label", evaluate=True
)
# Batch inference from a CSV file.
# seq_col/label_col name the input columns; with evaluate=True the ground-truth
# labels are also used to compute evaluation metrics, returned alongside the
# per-sequence results.
infer_file = "./test.csv"
results, metrics = inference_engine.infer_file(
infer_file, seq_col="sequence", label_col="label", evaluate=True
)
Encoding inputs: 0%| | 0/250 [00:00<?, ? examples/s]
Inferring: 100%|██████████| 16/16 [00:08<00:00, 1.86it/s]
In [9]:
Copied!
# Peek at the first prediction only (break after one item), then print the
# aggregate evaluation metrics (accuracy, precision, recall, F1, MCC, AUROC,
# AUPRC, TPR/TNR/FPR/FNR — see the output below).
for i, res in results.items():
print(res['label'], res['scores'], sep="\n")
break
print(metrics)
# Peek at the first prediction only (break after one item), then print the
# aggregate evaluation metrics (accuracy, precision, recall, F1, MCC, AUROC,
# AUPRC, TPR/TNR/FPR/FNR — see the output below).
for i, res in results.items():
print(res['label'], res['scores'], sep="\n")
break
print(metrics)
negative
{'negative': 0.9995535016059875, 'positive': 0.000446514313807711}
{'accuracy': 0.816, 'precision': 0.8888888888888888, 'recall': 0.26666666666666666, 'f1': 0.41025641025641024, 'mcc': 0.423204406266162, 'AUROC': 0.8621929824561403, 'AUPRC': 0.7100068048890062, 'TPR': 0.26666666666666666, 'TNR': 0.9894736842105263, 'FPR': 0.010526315789473684, 'FNR': 0.7333333333333333}
In [ ]:
Copied!