Custom Head
In [1]:
from dnallm import load_config, load_model_and_tokenizer, DNADataset, DNATrainer
Finetune with a custom classification head (for binary classification)
In [2]:
# Load the config file
configs = load_config("./finetune_config.yaml")
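The loaded configuration drives everything below. A quick way to see what was read from finetune_config.yaml (a minimal sketch; only the two sections used in this notebook are assumed to exist):

# Inspect the two config sections used in this notebook
print(configs['task'])        # task settings, including the classification head
print(configs['finetune'])    # fine-tuning settings, e.g. output_dir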
In [3]:
# Load the model and tokenizer
model_name = "zhangtaolab/plant-dnagpt-BPE"
# from ModelScope
model, tokenizer = load_model_and_tokenizer(model_name, task_config=configs['task'], source="modelscope")
Downloading Model from https://www.modelscope.cn to directory: /Users/forrest/.cache/modelscope/hub/models/zhangtaolab/plant-dnagpt-BPE
14:36:48 - dnallm.utils.support - INFO - Model files are stored in /Users/forrest/.cache/modelscope/hub/models/zhangtaolab/plant-dnagpt-BPE
14:36:50 - dnallm.utils.support - WARNING - Warning: Could not determine model type, falling back to 'mean' pooling.
14:36:50 - dnallm.utils.support - INFO - Using mean pooling strategy.
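Before building the full training pipeline it can help to push a single sequence through the model. The sketch below assumes the tokenizer and the wrapped classification model follow the standard Hugging Face call conventions (return_tensors="pt" on the tokenizer, a .logits field on the output); adapt it if your model wrapper differs.

import torch

# Hypothetical smoke test: encode one short sequence and check the logits shape.
# For binary classification the head should return two logits per input sequence.
enc = tokenizer("ATGCATGCATGCATGC", return_tensors="pt")
with torch.no_grad():
    out = model(**enc)
print(out.logits.shape)   # expected: torch.Size([1, 2])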
In [4]:
# Load the datasets
data_name = "zhangtaolab/plant-multi-species-core-promoters"
# from Hugging Face
# datasets = DNADataset.from_huggingface(data_name, seq_col="sequence", label_col="label", tokenizer=tokenizer, max_length=512)
# from ModelScope
datasets = DNADataset.from_modelscope(data_name, seq_col="sequence", label_col="label", tokenizer=tokenizer, max_length=512)
# sample datasets
sampled_datasets = datasets.sampling(0.1, overwrite=True)
# Encode the datasets
sampled_datasets.encode_sequences()
Encoding inputs: 0%| | 0/6656 [00:00<?, ? examples/s]
Encoding inputs: 0%| | 0/832 [00:00<?, ? examples/s]
Encoding inputs: 0%| | 0/832 [00:00<?, ? examples/s]
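The three progress bars show the sampled split sizes: 6,656 training, 832 validation, and 832 test examples. A quick sanity check of what sampling(0.1) did (a sketch; the 80/10/10 split of the source dataset is an assumption, not something reported by this notebook):

# Split sizes taken from the progress bars above
train_n, val_n, test_n = 6656, 832, 832
sampled_total = train_n + val_n + test_n     # 8,320 examples kept
print(sampled_total, sampled_total / 0.1)    # roughly 83,200 examples before 10% sampling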
In [5]:
# Initialize the trainer
trainer = DNATrainer(
model=model,
config=configs,
datasets=sampled_datasets
)
In [6]:
# Start training
metrics = trainer.train()
print(metrics)
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
[417/417 1:11:52, Epoch 3/3]
| Step | Training Loss | Validation Loss | Accuracy | Precision | Recall | F1 | MCC | AUROC | AUPRC | TPR | TNR | FPR | FNR |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 100 | 0.564600 | 0.477763 | 0.771635 | 0.808933 | 0.742597 | 0.774347 | 0.546099 | 0.856063 | 0.856556 | 0.742597 | 0.804071 | 0.195929 | 0.257403 |
| 200 | 0.446100 | 0.466926 | 0.782452 | 0.753937 | 0.872437 | 0.808870 | 0.567577 | 0.864404 | 0.859758 | 0.872437 | 0.681934 | 0.318066 | 0.127563 |
| 300 | 0.414200 | 0.460737 | 0.786058 | 0.772443 | 0.842825 | 0.806100 | 0.571194 | 0.868826 | 0.865918 | 0.842825 | 0.722646 | 0.277354 | 0.157175 |
| 400 | 0.361500 | 0.469775 | 0.776442 | 0.784270 | 0.794989 | 0.789593 | 0.551212 | 0.864665 | 0.860681 | 0.794989 | 0.755725 | 0.244275 | 0.205011 |
{'train_runtime': 4322.1369, 'train_samples_per_second': 4.62, 'train_steps_per_second': 0.096, 'total_flos': 5298226681872384.0, 'train_loss': 0.44137911144778025, 'epoch': 3.0}
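trainer.train() returns a plain dict of run-level statistics (the per-step evaluation metrics are shown in the table above). A small sketch using only the keys visible in the printout:

# Pull a few headline numbers out of the returned metrics dict
print(f"final training loss: {metrics['train_loss']:.4f}")
print(f"throughput: {metrics['train_samples_per_second']:.1f} samples/s over {metrics['train_runtime']:.0f} s")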
In [7]:
# Do prediction on the test set
results = trainer.infer()
results.metrics
Out[7]:
{'test_loss': 0.4892328679561615,
'test_accuracy': 0.7608173076923077,
'test_precision': 0.7344632768361582,
'test_recall': 0.8705357142857143,
'test_f1': 0.7967313585291114,
'test_mcc': 0.522206944653848,
'test_AUROC': 0.8532772972470237,
'test_AUPRC': 0.8636071401874013,
'test_TPR': 0.8705357142857143,
'test_TNR': 0.6328125,
'test_FPR': 0.3671875,
'test_FNR': 0.12946428571428573,
'test_runtime': 24.9902,
'test_samples_per_second': 33.293,
'test_steps_per_second': 0.72}
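results.metrics is a dict keyed with a test_ prefix, as shown above, so pulling out the headline numbers is a short loop:

# Print the headline test metrics (keys taken from the output above)
for key in ("test_accuracy", "test_f1", "test_mcc", "test_AUROC", "test_AUPRC"):
    print(key, round(results.metrics[key], 4))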
Model that is not compatible with the Transformers library (megaDNA)
In [8]:
# Change head config in the config file
configs['task'].head_config.head = "megadna"
# Change saved model path
configs['finetune'].output_dir = "./outputs_megadna"
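Both settings are plain attributes on the loaded config, so the change can be verified before reloading the model:

# Confirm the new head type and output directory before loading megaDNA
print(configs['task'].head_config.head)     # "megadna"
print(configs['finetune'].output_dir)       # "./outputs_megadna"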
In [9]:
# Load the model and tokenizer
model_name = "lingxusb/megaDNA_updated"
# from Hugging Face
model, tokenizer = load_model_and_tokenizer(model_name, task_config=configs['task'], source="huggingface")
# from ModelScope
# model, tokenizer = load_model_and_tokenizer(model_name, task_config=configs['task'], source="modelscope")
Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s]
megaDNA_phage_145M.pt: 0%| | 0.00/582M [00:00<?, ?B/s]
README.md: 0%| | 0.00/29.0 [00:00<?, ?B/s]
.gitattributes: 0.00B [00:00, ?B/s]
15:51:13 - dnallm.utils.support - INFO - Model files are stored in /Users/forrest/.cache/huggingface/hub/models--lingxusb--megaDNA_updated/snapshots/ed298be539e1667b52a1181a6472528a34dd2ef9
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
File ~/GitHub/DNALLM/dnallm/models/special/megadna.py:128, in _handle_megadna_models(model_name, source, head_config, extra)
    125 downloaded_model_path = os.path.join(
    126     downloaded_model_path, full_model_name
    127 )
--> 128 megadna_model = torch.load(
    129     downloaded_model_path, weights_only=False
    130 )
    131 megadna_tokenizer = DNATokenizer()

File ~/GitHub/DNALLM/.venv/lib/python3.12/site-packages/torch/serialization.py:1471, in load(f, map_location, pickle_module, weights_only, mmap, **pickle_load_args)
   1470     raise pickle.UnpicklingError(_get_wo_message(str(e))) from None
-> 1471 return _load(
   1472     opened_zipfile,
   1473     map_location,
   1474     pickle_module,
   1475     overall_storage=overall_storage,
   1476     **pickle_load_args,
   1477 )
   1478 if mmap:

File ~/GitHub/DNALLM/.venv/lib/python3.12/site-packages/torch/serialization.py:1964, in _load(zip_file, map_location, pickle_module, pickle_file, overall_storage, **pickle_load_args)
   1963 _serialization_tls.map_location = map_location
-> 1964 result = unpickler.load()
   1965 _serialization_tls.map_location = None

File ~/GitHub/DNALLM/.venv/lib/python3.12/site-packages/torch/serialization.py:1953, in _load.<locals>.UnpicklerWrapper.find_class(self, mod_name, name)
   1952 mod_name = load_module_mapping.get(mod_name, mod_name)
-> 1953 return super().find_class(mod_name, name)

ModuleNotFoundError: No module named 'megaDNA'

The above exception was the direct cause of the following exception:

ImportError                               Traceback (most recent call last)
Cell In[9], line 4
      2 model_name = "lingxusb/megaDNA_updated"
      3 # from Hugging Face
----> 4 model, tokenizer = load_model_and_tokenizer(model_name, task_config=configs['task'], source="huggingface")
      5 # from ModelScope
      6 # model, tokenizer = load_model_and_tokenizer(model_name, task_config=configs['task'], source="modelscope")

File ~/GitHub/DNALLM/dnallm/models/model.py:855, in load_model_and_tokenizer(model_name, task_config, source, use_mirror, revision, custom_tokenizer)
    852 _ = _handle_gpn_models(model_name)
    854 # Handle special case for megaDNA models
--> 855 megadna_result = _handle_megadna_models(model_name, source, head_config)
    856 if megadna_result is not None:
    857     return megadna_result

File ~/GitHub/DNALLM/dnallm/models/special/megadna.py:147, in _handle_megadna_models(model_name, source, head_config, extra)
    142     megadna_model = DNALLMforSequenceClassification(
    143         config=model_config,
    144         custom_model=megadna_model,
    145     )
    146 except ImportError as e:
--> 147     raise ImportError(
    148         f"megaDNA package is required for "
    149         f"{model_name} but not installed. "
    150         "Please install it following the instructions at: "
    151         "https://github.com/lingxusb/megaDNA"
    152     ) from e
    154 return megadna_model, megadna_tokenizer
    156 return None

ImportError: megaDNA package is required for lingxusb/megaDNA_updated but not installed. Please install it following the instructions at: https://github.com/lingxusb/megaDNA
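The load fails because the checkpoint is a pickled model object, so torch.load() needs the megaDNA package importable in the current environment. Follow the installation instructions at https://github.com/lingxusb/megaDNA; as a rough sketch (the clone path is hypothetical, and whether the project is pip-installable is not verified here), making the package importable and re-running the cell above looks like this:

import sys

# Hypothetical path to a local clone of https://github.com/lingxusb/megaDNA;
# adjust to wherever the package actually lives on your machine.
sys.path.append("/path/to/megaDNA")

# then re-run the cell above:
# model, tokenizer = load_model_and_tokenizer(model_name, task_config=configs['task'], source="huggingface")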
In [ ]:
# Load the datasets
datasets = DNADataset.from_modelscope(data_name, seq_col="sequence", label_col="label", tokenizer=tokenizer, max_length=1024)
sampled_datasets = datasets.sampling(0.1, overwrite=True)
sampled_datasets.encode_sequences()
Encoding inputs: 0%| | 0/6656 [00:00<?, ? examples/s]
Encoding inputs: 0%| | 0/832 [00:00<?, ? examples/s]
Encoding inputs: 0%| | 0/832 [00:00<?, ? examples/s]
In [ ]:
# Initialize the trainer
trainer = DNATrainer(
model=model,
config=configs,
datasets=sampled_datasets
)
In [ ]:
# Start training
metrics = trainer.train()
print(metrics)
/home/liuguanqing/miniforge3/lib/python3.12/contextlib.py:105: FutureWarning: `torch.backends.cuda.sdp_kernel()` is deprecated. In the future, this context manager will be removed. Please see `torch.nn.attention.sdpa_kernel()` for the new context manager, with updated signature.
  self.gen = func(*args, **kwds)
[417/417 01:54, Epoch 3/3]
| Step | Training Loss | Validation Loss | Accuracy | Precision | Recall | F1 | MCC | AUROC | AUPRC | TPR | TNR | FPR | FNR |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 100 | 0.699400 | 0.674930 | 0.576923 | 0.584821 | 0.612150 | 0.598174 | 0.152141 | 0.606470 | 0.613311 | 0.612150 | 0.539604 | 0.460396 | 0.387850 |
| 200 | 0.675700 | 0.667276 | 0.587740 | 0.592191 | 0.637850 | 0.614173 | 0.173450 | 0.626203 | 0.625642 | 0.637850 | 0.534653 | 0.465347 | 0.362150 |
| 300 | 0.646800 | 0.674051 | 0.597356 | 0.662021 | 0.443925 | 0.531469 | 0.214306 | 0.664928 | 0.658847 | 0.443925 | 0.759901 | 0.240099 | 0.556075 |
| 400 | 0.618900 | 0.667258 | 0.610577 | 0.668831 | 0.481308 | 0.559783 | 0.236859 | 0.673285 | 0.666627 | 0.481308 | 0.747525 | 0.252475 | 0.518692 |
{'train_runtime': 114.7143, 'train_samples_per_second': 174.067, 'train_steps_per_second': 3.635, 'total_flos': 1.8043379178799104e+16, 'train_loss': 0.6572908154494471, 'epoch': 3.0}
In [ ]:
# Do prediction on the test set
results = trainer.infer()
results.metrics
/home/liuguanqing/miniforge3/lib/python3.12/contextlib.py:105: FutureWarning: `torch.backends.cuda.sdp_kernel()` is deprecated. In the future, this context manager will be removed. Please see `torch.nn.attention.sdpa_kernel()` for the new context manager, with updated signature.
  self.gen = func(*args, **kwds)
Out[ ]:
{'test_loss': 0.6516980528831482,
'test_accuracy': 0.6213942307692307,
'test_precision': 0.6920289855072463,
'test_recall': 0.45368171021377673,
'test_f1': 0.5480631276901005,
'test_mcc': 0.26214204444019135,
'test_AUROC': 0.6950950985661528,
'test_AUPRC': 0.6844861256476427,
'test_TPR': 0.45368171021377673,
'test_TNR': 0.7931873479318735,
'test_FPR': 0.20681265206812652,
'test_FNR': 0.5463182897862233,
'test_runtime': 1.2112,
'test_samples_per_second': 686.915,
'test_steps_per_second': 14.861}
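For a quick side-by-side view of the two fine-tuned models, the numbers below are copied from the test metrics printed earlier in this notebook (no new measurements):

# Test-set comparison, values taken from the two results.metrics outputs above
comparison = {
    "plant-dnagpt-BPE": {"accuracy": 0.761, "f1": 0.797, "mcc": 0.522, "auroc": 0.853},
    "megaDNA":          {"accuracy": 0.621, "f1": 0.548, "mcc": 0.262, "auroc": 0.695},
}
for name, scores in comparison.items():
    print(name, scores)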