Custom Head
In [1]:
from dnallm import load_config, load_model_and_tokenizer, DNADataset, DNATrainer
Finetune with a custom classification head (for binary classification)
In [2]:
# Load the config file
configs = load_config("./finetune_config.yaml")
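The loaded configuration drives everything below. A quick way to see what was read from finetune_config.yaml (a minimal sketch; only the two sections used in this notebook are assumed to exist):

# Inspect the two config sections used in this notebook
print(configs['task'])        # task settings, including the classification head
print(configs['finetune'])    # fine-tuning settings, e.g. output_dir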
In [3]:
# Load the model and tokenizer
model_name = "zhangtaolab/plant-dnagpt-BPE"
# from ModelScope
model, tokenizer = load_model_and_tokenizer(model_name, task_config=configs['task'], source="modelscope")
Downloading Model from https://www.modelscope.cn to directory: /Users/forrest/.cache/modelscope/hub/models/zhangtaolab/plant-dnagpt-BPE
14:36:48 - dnallm.utils.support - INFO - Model files are stored in /Users/forrest/.cache/modelscope/hub/models/zhangtaolab/plant-dnagpt-BPE
14:36:50 - dnallm.utils.support - WARNING - Warning: Could not determine model type, falling back to 'mean' pooling.
14:36:50 - dnallm.utils.support - INFO - Using mean pooling strategy.
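Before building the full training pipeline it can help to push a single sequence through the model. The sketch below assumes the tokenizer and the wrapped classification model follow the standard Hugging Face call conventions (return_tensors="pt" on the tokenizer, a .logits field on the output); adapt it if your model wrapper differs.

import torch

# Hypothetical smoke test: encode one short sequence and check the logits shape.
# For binary classification the head should return two logits per input sequence.
enc = tokenizer("ATGCATGCATGCATGC", return_tensors="pt")
with torch.no_grad():
    out = model(**enc)
print(out.logits.shape)   # expected: torch.Size([1, 2])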
In [4]:
# Load the datasets
data_name = "zhangtaolab/plant-multi-species-core-promoters"
# from Hugging Face
# datasets = DNADataset.from_huggingface(data_name, seq_col="sequence", label_col="label", tokenizer=tokenizer, max_length=512)
# from ModelScope
datasets = DNADataset.from_modelscope(data_name, seq_col="sequence", label_col="label", tokenizer=tokenizer, max_length=512)
# sample datasets
sampled_datasets = datasets.sampling(0.1, overwrite=True)
# Encode the datasets
sampled_datasets.encode_sequences()
Encoding inputs: 0%| | 0/6656 [00:00<?, ? examples/s]
Encoding inputs: 0%| | 0/832 [00:00<?, ? examples/s]
Encoding inputs: 0%| | 0/832 [00:00<?, ? examples/s]
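The three progress bars show the sampled split sizes: 6,656 training, 832 validation, and 832 test examples. A quick sanity check of what sampling(0.1) did (a sketch; the 80/10/10 split of the source dataset is an assumption, not something reported by this notebook):

# Split sizes taken from the progress bars above
train_n, val_n, test_n = 6656, 832, 832
sampled_total = train_n + val_n + test_n     # 8,320 examples kept
print(sampled_total, sampled_total / 0.1)    # roughly 83,200 examples before 10% sampling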
In [5]:
# Initialize the trainer
trainer = DNATrainer(
model=model,
config=configs,
datasets=sampled_datasets
)
In [6]:
# Start training
metrics = trainer.train()
print(metrics)
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
[417/417 1:11:52, Epoch 3/3]
| Step | Training Loss | Validation Loss | Accuracy | Precision | Recall | F1 | MCC | AUROC | AUPRC | TPR | TNR | FPR | FNR |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 100 | 0.564600 | 0.477763 | 0.771635 | 0.808933 | 0.742597 | 0.774347 | 0.546099 | 0.856063 | 0.856556 | 0.742597 | 0.804071 | 0.195929 | 0.257403 |
| 200 | 0.446100 | 0.466926 | 0.782452 | 0.753937 | 0.872437 | 0.808870 | 0.567577 | 0.864404 | 0.859758 | 0.872437 | 0.681934 | 0.318066 | 0.127563 |
| 300 | 0.414200 | 0.460737 | 0.786058 | 0.772443 | 0.842825 | 0.806100 | 0.571194 | 0.868826 | 0.865918 | 0.842825 | 0.722646 | 0.277354 | 0.157175 |
| 400 | 0.361500 | 0.469775 | 0.776442 | 0.784270 | 0.794989 | 0.789593 | 0.551212 | 0.864665 | 0.860681 | 0.794989 | 0.755725 | 0.244275 | 0.205011 |
{'train_runtime': 4322.1369, 'train_samples_per_second': 4.62, 'train_steps_per_second': 0.096, 'total_flos': 5298226681872384.0, 'train_loss': 0.44137911144778025, 'epoch': 3.0}
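trainer.train() returns a plain dict of run-level statistics (the per-step evaluation metrics are shown in the table above). A small sketch using only the keys visible in the printout:

# Pull a few headline numbers out of the returned metrics dict
print(f"final training loss: {metrics['train_loss']:.4f}")
print(f"throughput: {metrics['train_samples_per_second']:.1f} samples/s over {metrics['train_runtime']:.0f} s")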
In [7]:
# Do prediction on the test set
results = trainer.infer()
results.metrics
Out[7]:
{'test_loss': 0.4892328679561615,
'test_accuracy': 0.7608173076923077,
'test_precision': 0.7344632768361582,
'test_recall': 0.8705357142857143,
'test_f1': 0.7967313585291114,
'test_mcc': 0.522206944653848,
'test_AUROC': 0.8532772972470237,
'test_AUPRC': 0.8636071401874013,
'test_TPR': 0.8705357142857143,
'test_TNR': 0.6328125,
'test_FPR': 0.3671875,
'test_FNR': 0.12946428571428573,
'test_runtime': 24.9902,
'test_samples_per_second': 33.293,
'test_steps_per_second': 0.72}
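results.metrics is a dict keyed with a test_ prefix, as shown above, so pulling out the headline numbers is a short loop:

# Print the headline test metrics (keys taken from the output above)
for key in ("test_accuracy", "test_f1", "test_mcc", "test_AUROC", "test_AUPRC"):
    print(key, round(results.metrics[key], 4))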
Model that is not compatible with the Transformers library (megaDNA)
In [8]:
# Change head config in the config file
configs['task'].head_config.head = "megadna"
# Change saved model path
configs['finetune'].output_dir = "./outputs_megadna"
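Both settings are plain attributes on the loaded config, so the change can be verified before reloading the model:

# Confirm the new head type and output directory before loading megaDNA
print(configs['task'].head_config.head)     # "megadna"
print(configs['finetune'].output_dir)       # "./outputs_megadna"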
In [9]:
# Load the model and tokenizer
model_name = "lingxusb/megaDNA_updated"
# from Hugging Face
model, tokenizer = load_model_and_tokenizer(model_name, task_config=configs['task'], source="huggingface")
# from ModelScope
# model, tokenizer = load_model_and_tokenizer(model_name, task_config=configs['task'], source="modelscope")
Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s]
megaDNA_phage_145M.pt: 0%| | 0.00/582M [00:00<?, ?B/s]
README.md: 0%| | 0.00/29.0 [00:00<?, ?B/s]
.gitattributes: 0.00B [00:00, ?B/s]
15:51:13 - dnallm.utils.support - INFO - Model files are stored in /Users/forrest/.cache/huggingface/hub/models--lingxusb--megaDNA_updated/snapshots/ed298be539e1667b52a1181a6472528a34dd2ef9
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
File ~/GitHub/DNALLM/dnallm/models/special/megadna.py:128, in _handle_megadna_models(model_name, source, head_config, extra)
    125 downloaded_model_path = os.path.join(
    126     downloaded_model_path, full_model_name
    127 )
--> 128 megadna_model = torch.load(
    129     downloaded_model_path, weights_only=False
    130 )
    131 megadna_tokenizer = DNATokenizer()

File ~/GitHub/DNALLM/.venv/lib/python3.12/site-packages/torch/serialization.py:1471, in load(f, map_location, pickle_module, weights_only, mmap, **pickle_load_args)
   1470     raise pickle.UnpicklingError(_get_wo_message(str(e))) from None
-> 1471 return _load(
   1472     opened_zipfile,
   1473     map_location,
   1474     pickle_module,
   1475     overall_storage=overall_storage,
   1476     **pickle_load_args,
   1477 )
   1478 if mmap:

File ~/GitHub/DNALLM/.venv/lib/python3.12/site-packages/torch/serialization.py:1964, in _load(zip_file, map_location, pickle_module, pickle_file, overall_storage, **pickle_load_args)
   1963 _serialization_tls.map_location = map_location
-> 1964 result = unpickler.load()
   1965 _serialization_tls.map_location = None

File ~/GitHub/DNALLM/.venv/lib/python3.12/site-packages/torch/serialization.py:1953, in _load.<locals>.UnpicklerWrapper.find_class(self, mod_name, name)
   1952 mod_name = load_module_mapping.get(mod_name, mod_name)
-> 1953 return super().find_class(mod_name, name)

ModuleNotFoundError: No module named 'megaDNA'

The above exception was the direct cause of the following exception:

ImportError                               Traceback (most recent call last)
Cell In[9], line 4
      2 model_name = "lingxusb/megaDNA_updated"
      3 # from Hugging Face
----> 4 model, tokenizer = load_model_and_tokenizer(model_name, task_config=configs['task'], source="huggingface")
      5 # from ModelScope
      6 # model, tokenizer = load_model_and_tokenizer(model_name, task_config=configs['task'], source="modelscope")

File ~/GitHub/DNALLM/dnallm/models/model.py:855, in load_model_and_tokenizer(model_name, task_config, source, use_mirror, revision, custom_tokenizer)
    852 _ = _handle_gpn_models(model_name)
    854 # Handle special case for megaDNA models
--> 855 megadna_result = _handle_megadna_models(model_name, source, head_config)
    856 if megadna_result is not None:
    857     return megadna_result

File ~/GitHub/DNALLM/dnallm/models/special/megadna.py:147, in _handle_megadna_models(model_name, source, head_config, extra)
    142     megadna_model = DNALLMforSequenceClassification(
    143         config=model_config,
    144         custom_model=megadna_model,
    145     )
    146 except ImportError as e:
--> 147     raise ImportError(
    148         f"megaDNA package is required for "
    149         f"{model_name} but not installed. "
    150         "Please install it following the instructions at: "
    151         "https://github.com/lingxusb/megaDNA"
    152     ) from e
    154 return megadna_model, megadna_tokenizer
    156 return None

ImportError: megaDNA package is required for lingxusb/megaDNA_updated but not installed. Please install it following the instructions at: https://github.com/lingxusb/megaDNA
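The load fails because the checkpoint is a pickled model object, so torch.load() needs the megaDNA package importable in the current environment. Follow the installation instructions at https://github.com/lingxusb/megaDNA; as a rough sketch (the clone path is hypothetical, and whether the project is pip-installable is not verified here), making the package importable and re-running the cell above looks like this:

import sys

# Hypothetical path to a local clone of https://github.com/lingxusb/megaDNA;
# adjust to wherever the package actually lives on your machine.
sys.path.append("/path/to/megaDNA")

# then re-run the cell above:
# model, tokenizer = load_model_and_tokenizer(model_name, task_config=configs['task'], source="huggingface")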
In [ ]:
# Load the datasets
datasets = DNADataset.from_modelscope(data_name, seq_col="sequence", label_col="label", tokenizer=tokenizer, max_length=1024)
sampled_datasets = datasets.sampling(0.1, overwrite=True)
sampled_datasets.encode_sequences()
Encoding inputs: 0%| | 0/6656 [00:00<?, ? examples/s]
Encoding inputs: 0%| | 0/832 [00:00<?, ? examples/s]
Encoding inputs: 0%| | 0/832 [00:00<?, ? examples/s]
In [ ]:
# Initialize the trainer
trainer = DNATrainer(
model=model,
config=configs,
datasets=sampled_datasets
)
In [ ]:
# Start training
metrics = trainer.train()
print(metrics)
/home/liuguanqing/miniforge3/lib/python3.12/contextlib.py:105: FutureWarning: `torch.backends.cuda.sdp_kernel()` is deprecated. In the future, this context manager will be removed. Please see `torch.nn.attention.sdpa_kernel()` for the new context manager, with updated signature.
  self.gen = func(*args, **kwds)
[417/417 01:54, Epoch 3/3]
| Step | Training Loss | Validation Loss | Accuracy | Precision | Recall | F1 | MCC | AUROC | AUPRC | TPR | TNR | FPR | FNR |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 100 | 0.699400 | 0.674930 | 0.576923 | 0.584821 | 0.612150 | 0.598174 | 0.152141 | 0.606470 | 0.613311 | 0.612150 | 0.539604 | 0.460396 | 0.387850 |
| 200 | 0.675700 | 0.667276 | 0.587740 | 0.592191 | 0.637850 | 0.614173 | 0.173450 | 0.626203 | 0.625642 | 0.637850 | 0.534653 | 0.465347 | 0.362150 |
| 300 | 0.646800 | 0.674051 | 0.597356 | 0.662021 | 0.443925 | 0.531469 | 0.214306 | 0.664928 | 0.658847 | 0.443925 | 0.759901 | 0.240099 | 0.556075 |
| 400 | 0.618900 | 0.667258 | 0.610577 | 0.668831 | 0.481308 | 0.559783 | 0.236859 | 0.673285 | 0.666627 | 0.481308 | 0.747525 | 0.252475 | 0.518692 |
{'train_runtime': 114.7143, 'train_samples_per_second': 174.067, 'train_steps_per_second': 3.635, 'total_flos': 1.8043379178799104e+16, 'train_loss': 0.6572908154494471, 'epoch': 3.0}
In [ ]:
# Do prediction on the test set
results = trainer.infer()
results.metrics
/home/liuguanqing/miniforge3/lib/python3.12/contextlib.py:105: FutureWarning: `torch.backends.cuda.sdp_kernel()` is deprecated. In the future, this context manager will be removed. Please see `torch.nn.attention.sdpa_kernel()` for the new context manager, with updated signature.
  self.gen = func(*args, **kwds)
Out[ ]:
{'test_loss': 0.6516980528831482,
'test_accuracy': 0.6213942307692307,
'test_precision': 0.6920289855072463,
'test_recall': 0.45368171021377673,
'test_f1': 0.5480631276901005,
'test_mcc': 0.26214204444019135,
'test_AUROC': 0.6950950985661528,
'test_AUPRC': 0.6844861256476427,
'test_TPR': 0.45368171021377673,
'test_TNR': 0.7931873479318735,
'test_FPR': 0.20681265206812652,
'test_FNR': 0.5463182897862233,
'test_runtime': 1.2112,
'test_samples_per_second': 686.915,
'test_steps_per_second': 14.861}
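For a quick side-by-side view of the two fine-tuned models, the numbers below are copied from the test metrics printed earlier in this notebook (no new measurements):

# Test-set comparison, values taken from the two results.metrics outputs above
comparison = {
    "plant-dnagpt-BPE": {"accuracy": 0.761, "f1": 0.797, "mcc": 0.522, "auroc": 0.853},
    "megaDNA":          {"accuracy": 0.621, "f1": 0.548, "mcc": 0.262, "auroc": 0.695},
}
for name, scores in comparison.items():
    print(name, scores)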