
inference/benchmark API

dnallm.inference.benchmark

DNA Language Model Benchmarking Module.

This module provides comprehensive benchmarking capabilities for DNA language models, including performance evaluation, metrics calculation, and result visualization.

Classes

Benchmark

Benchmark(config)

Class for benchmarking DNA Language Models.

This class provides methods to evaluate the performance of different DNA language models on various tasks, including classification, regression, and token classification.

Attributes:
    config: Configuration dictionary containing task settings and
        inference parameters
    all_models: Dictionary mapping source names to sets of available
        model names
    datasets: List of datasets loaded for benchmarking
    prepared: Preset benchmark parameters loaded from the configuration,
        or None if no "benchmark" section is present

Initialize the Benchmark class.

Parameters:

Name Type Description Default
config dict

Configuration object containing task settings and inference parameters

required
Source code in dnallm/inference/benchmark.py (lines 41-67)
def __init__(self, config: dict):
    """Initialize the Benchmark class.

    Args:
        config: Configuration object containing task settings and
            inference parameters
    """
    self.config = config
    self.all_models = {
        "huggingface": set(
            np.concatenate([
                MODEL_INFO[m]["huggingface"] for m in MODEL_INFO
            ]).tolist()
        ),
        "modelscope": set(
            np.concatenate([
                MODEL_INFO[m]["modelscope"] for m in MODEL_INFO
            ]).tolist()
        ),
    }
    self.datasets: list[str] = []
    # Load preset benchmark configuration if available
    if "benchmark" in config:
        self.prepared = self.__load_from_config()

    else:
        self.prepared = None
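
A minimal usage sketch. The configuration objects below are SimpleNamespace stand-ins for DNALLM's TaskConfig and InferenceConfig; they only illustrate which fields the class reads, and all values are placeholders.

from types import SimpleNamespace

from dnallm.inference.benchmark import Benchmark

# Sketch only: in a real project "task" and "inference" hold DNALLM's
# TaskConfig / InferenceConfig objects; SimpleNamespace stands in here
# to show which fields Benchmark reads. All values are illustrative.
config = {
    "task": SimpleNamespace(
        task_type="binary",
        num_labels=2,
        label_names=["negative", "positive"],
        threshold=0.5,
    ),
    "inference": SimpleNamespace(
        batch_size=32,
        max_length=512,
        num_workers=0,
        output_dir="./results",
    ),
}
bench = Benchmark(config)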
Functions
__load_from_config
__load_from_config()

Load the benchmark-specific parameters from the configuration.

Source code in dnallm/inference/benchmark.py (lines 69-108)
def __load_from_config(self):
    """Load the benchmark-specific parameters from the configuration."""
    benchmark_config = self.config["benchmark"]
    config_path = os.path.dirname(benchmark_config.config_path)
    models = benchmark_config.models
    model_names = {m.name: m.path for m in models}
    sources = [m.source if hasattr(m, "source") else None for m in models]
    self.config["inference"] = InferenceConfig
    for k, v in dict(benchmark_config.evaluation).items():
        setattr(self.config["inference"], k, v)
    self.config["inference"].output_dir = benchmark_config.output.path
    if hasattr(benchmark_config, "datasets"):
        datasets = benchmark_config.datasets
        task_configs = []
        for d in datasets:
            self.config["task"] = TaskConfig
            self.config["task"].task_type = d.task
            self.config["task"].num_labels = d.num_labels
            self.config["task"].label_names = d.label_names
            self.config["task"].threshold = d.threshold
            data_path = (
                d.path
                if os.path.isfile(d.path)
                else os.path.abspath(config_path + "/" + d.path)
            )
            self.get_dataset(data_path, d.text_column, d.label_column)
            task_configs.append(self.config["task"])
    else:
        datasets = []
        task_configs = [self.config["task"]]
    metrics = benchmark_config.metrics
    plot_format = benchmark_config.output.format
    return {
        "models": model_names,
        "sources": sources,
        "tasks": task_configs,
        "dataset": datasets,
        "metrics": metrics,
        "plot_format": plot_format,
    }
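
For reference, a sketch of the attributes the preset "benchmark" entry is expected to expose. It is built here with SimpleNamespace purely for illustration; in practice the entry comes from DNALLM's configuration loader, and every name and value below is a placeholder.

from types import SimpleNamespace

# Illustrative only: mirrors the attributes that __load_from_config reads.
benchmark_entry = SimpleNamespace(
    config_path="/path/to/benchmark.yaml",  # its directory resolves relative dataset paths
    models=[
        SimpleNamespace(name="ModelA", path="org/model-a", source="huggingface"),
    ],
    evaluation={"batch_size": 32, "max_length": 512, "num_workers": 0},
    output=SimpleNamespace(path="./results", format="pdf"),
    datasets=[
        SimpleNamespace(
            name="demo",
            task="binary",
            num_labels=2,
            label_names=["negative", "positive"],
            threshold=0.5,
            path="data/demo.csv",
            text_column="sequence",
            label_column="labels",
        ),
    ],
    metrics=["all"],  # or a list of specific metric names
)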
available_models
available_models(show_all=True)

List all available models.

Parameters:

Name Type Description Default
show_all bool

If True, return full information for all available models. If False, return only a mapping of model names to their tags.

True

Returns:

Type Description
dict[str, Any]

Full model information dictionary if show_all=True, otherwise a dictionary mapping model names to their tags
Source code in dnallm/inference/benchmark.py (lines 156-172)
def available_models(self, show_all: bool = True) -> dict[str, Any]:
    """List all available models.

    Args:
        show_all: If True, return full information for all available
            models. If False, return only a mapping of model names to
            their tags.

    Returns:
        Full model information dictionary if show_all=True, otherwise a
            dictionary mapping model names to their tags.
    """
    # Load the model information
    if show_all:
        return MODEL_INFO
    else:
        models_list = {m: MODEL_INFO[m]["model_tags"] for m in MODEL_INFO}
        return models_list
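
For example, assuming `bench` is a Benchmark instance as constructed above:

info = bench.available_models()                # full MODEL_INFO dictionary
tags = bench.available_models(show_all=False)  # {model name: model tags}
print(sorted(tags))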
get_dataset
get_dataset(
    seq_or_path, seq_col="sequence", label_col="labels"
)

Load the dataset from the specified path or list of sequences.

Parameters:

Name Type Description Default
seq_or_path str | list[str]

Path to the sequence file or list of sequences

required
seq_col str

Column name for DNA sequences, default "sequence"

'sequence'
label_col str

Column name for labels, default "labels"

'labels'

Returns:

Name Type Description
DNADataset DNADataset

Dataset object containing the sequences and labels

Source code in dnallm/inference/benchmark.py (lines 127-154)
def get_dataset(
    self,
    seq_or_path: str | list[str],
    seq_col: str = "sequence",
    label_col: str = "labels",
) -> DNADataset:
    """Load the dataset from the specified path or list of sequences.

    Args:
        seq_or_path: Path to the sequence file or list of sequences
        seq_col: Column name for DNA sequences, default "sequence"
        label_col: Column name for labels, default "labels"

    Returns:
        DNADataset: Dataset object containing the sequences and labels
    """
    inference_engine = DNAInference(
        model=None, tokenizer=None, config=self.config
    )
    ds, _ = inference_engine.generate_dataset(
        seq_or_path=seq_or_path,
        seq_col=seq_col,
        label_col=label_col,
        keep_seqs=False,
        do_encode=False,
    )
    self.datasets.append(ds.dataset)
    return ds
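
For example (the file path and column names are illustrative; `bench` is a Benchmark instance as above):

ds = bench.get_dataset(
    "data/demo.csv",        # or a list of raw DNA sequence strings
    seq_col="sequence",
    label_col="labels",
)
print(len(bench.datasets))  # the loaded dataset is also queued for run()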
get_inference_engine
get_inference_engine(model, tokenizer)

Create an inference engine object for the model.

Parameters:

Name Type Description Default
model

The model to be used for inference

required
tokenizer

The tokenizer to be used for encoding sequences

required

Returns:

Name Type Description
DNAInference DNAInference

The inference engine object configured with the given model and tokenizer

Source code in dnallm/inference/benchmark.py (lines 110-125)
def get_inference_engine(self, model, tokenizer) -> DNAInference:
    """Create an inference engine object for the model.

    Args:
        model: The model to be used for inference
        tokenizer: The tokenizer to be used for encoding sequences

    Returns:
        DNAInference: The inference engine object configured with the given
            model and tokenizer
    """

    inference_engine = DNAInference(
        model=model, tokenizer=tokenizer, config=self.config
    )
    return inference_engine
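
A short sketch, assuming `model` and `tokenizer` have already been loaded (run() does this internally via load_model_and_tokenizer):

engine = bench.get_inference_engine(model, tokenizer)
# `engine` is a DNAInference object that shares the benchmark configuration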
plot
plot(
    metrics,
    show_score=True,
    save_path=None,
    separate=False,
    dataset=0,
)

Plot the benchmark results.

This method generates various types of plots based on the task type:

- For classification tasks: bar charts for metrics and ROC curves
- For regression tasks: bar charts for metrics and scatter plots
- For token classification: bar charts for metrics only

Parameters:

Name Type Description Default
metrics dict

Dictionary containing model metrics

required
show_score bool

Whether to show the score on the plot

True
save_path str | None

Path to save the plot. If None, plots will be shown interactively

None
separate bool

Whether to save the plots separately

False
dataset int | str

Index or name of the dataset whose metrics are plotted when results for multiple datasets are provided

0

Returns:

Type Description
None

None

Source code in dnallm/inference/benchmark.py (lines 318-420)
def plot(
    self,
    metrics: dict,
    show_score: bool = True,
    save_path: str | None = None,
    separate: bool = False,
    dataset: int | str = 0,
) -> None:
    """Plot the benchmark results.

    This method generates various types of plots based on the task type:
    - For classification tasks: bar charts for metrics and ROC curves
    - For regression tasks: bar charts for metrics and scatter plots
    - For token classification: bar charts for metrics only

    Args:
        metrics: Dictionary containing model metrics
        show_score: Whether to show the score on the plot
        save_path: Path to save the plot. If None, plots will be shown
            interactively
        separate: Whether to save the plots separately
        dataset: Index or name of the dataset to plot when metrics for
            multiple datasets are provided

    Returns:
        None
    """
    task_config = self.config["task"]
    task_type = task_config.task_type
    # Select dataset if multiple datasets are provided
    if isinstance(dataset, int):
        metrics = metrics[list(metrics.keys())[dataset]]
    elif isinstance(dataset, str):
        if dataset in metrics:
            metrics = metrics[dataset]
        else:
            raise ValueError(
                f"Dataset name '{dataset}' not found in metrics."
            )
    else:
        metrics = metrics[next(iter(metrics.keys()))]
    if task_type in ["binary", "multiclass", "multilabel", "token"]:
        # Prepare data for plotting
        bars_data, curves_data = prepare_data(metrics, task_type=task_type)
        if save_path:
            suffix = os.path.splitext(save_path)[-1]
            if suffix:
                bar_chart = save_path.replace(suffix, "_metrics" + suffix)
                line_chart = save_path.replace(suffix, "_roc" + suffix)
            else:
                bar_chart = os.path.join(save_path, "metrics.pdf")
                line_chart = os.path.join(save_path, "roc.pdf")
        else:
            bar_chart = None
            line_chart = None
        # Plot bar charts
        pbar = plot_bars(
            bars_data,
            show_score=show_score,
            save_path=bar_chart,
            separate=separate,
        )
        # Plot curve charts
        if task_type == "token":
            pline = None
        else:
            pline = plot_curve(
                curves_data,
                show_score=show_score,
                save_path=line_chart,
                separate=separate,
            )
        return pbar, pline
    elif task_type == "regression":
        # Prepare data for plotting
        bars_data, scatter_data = prepare_data(
            metrics, task_type=task_type
        )
        if save_path:
            suffix = os.path.splitext(save_path)[-1]
            if suffix:
                bar_chart = save_path.replace(suffix, "_metrics" + suffix)
                scatter_plot = save_path.replace(
                    suffix, "_scatter" + suffix
                )
            else:
                bar_chart = os.path.join(save_path, "metrics.pdf")
                scatter_plot = os.path.join(save_path, "scatter.pdf")
        else:
            bar_chart = None
            scatter_plot = None
        # Plot bar charts
        pbar = plot_bars(
            bars_data,
            show_score=show_score,
            save_path=bar_chart,
            separate=separate,
        )
        # Plot scatter plots
        pdot = plot_scatter(
            scatter_data,
            show_score=show_score,
            save_path=scatter_plot,
            separate=separate,
        )
        return pbar, pdot
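
Typical usage, assuming `metrics` is the dictionary returned by run() (the file name is illustrative):

bench.plot(metrics, show_score=True, save_path="results/benchmark.pdf")
# For classification tasks this writes results/benchmark_metrics.pdf and
# results/benchmark_roc.pdf; pass a directory (no file suffix) to use the
# default names metrics.pdf and roc.pdf, or omit save_path to display the
# plots interactively.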
run
run(
    model_names=None,
    source="huggingface",
    use_mirror=False,
    save_preds=False,
    save_scores=True,
)

Perform the benchmark evaluation on multiple models.

This method loads each model, runs predictions on the dataset, calculates metrics, and optionally saves the results.

Parameters:

Name Type Description Default
model_names list[str] | dict | None

List of model names or a dictionary mapping model names to paths

None
source str

Source of the models ('local', 'huggingface', 'modelscope')

'huggingface'
use_mirror bool

Whether to use a mirror for downloading models

False
save_preds bool

Whether to save the predictions

False
save_scores bool

Whether to save the metrics

True

Returns:

Type Description
dict[str, Any]

dict[str, Any]: Dictionary containing benchmark results for each dataset and model

Raises:

Type Description
NameError

If model cannot be found in either the given source or local storage

Source code in dnallm/inference/benchmark.py (lines 174-316)
def run(
    self,
    model_names: list[str] | dict | None = None,
    source: str = "huggingface",
    use_mirror: bool = False,
    save_preds: bool = False,
    save_scores: bool = True,
) -> dict[str, Any]:
    """Perform the benchmark evaluation on multiple models.

            This method loads each model, runs predictions on the dataset,
        calculates metrics,
    and optionally saves the results.

    Args:
                    model_names: List of model names or
            a dictionary mapping model names to paths
        source: Source of the models ('local', 'huggingface', 'modelscope')
        use_mirror: Whether to use a mirror for downloading models
        save_preds: Whether to save the predictions
        save_scores: Whether to save the metrics

    Returns:
        dict[str, Any]: Dictionary containing benchmark results for each
            dataset and model

    Raises:
        NameError: If model cannot be found in either the given source or
            local storage
    """
    all_results: dict[str, Any] = {}
    selected_results: dict[str, Any] = {}
    metrics_save: dict[str, Any] = {}
    if self.prepared:
        task_configs = self.prepared["tasks"]
        pred_config = self.config["inference"]
        # Get datasets and model names from preset config
        dataset_names = [d.name for d in self.prepared["dataset"]]
        model_names = self.prepared["models"]
        sources = [s if s else source for s in self.prepared["sources"]]
        selected_metrics = self.prepared["metrics"]
    else:
        # Get configurations from the provided config
        task_configs = [self.config["task"]]
        pred_config = self.config["inference"]
        # Get dataset and model names manually
        dataset_names = ["custom"]
        sources = [source] * len(model_names)
        selected_metrics = []
        # Load the dataset
    for di, dname in enumerate(dataset_names):
        print("Dataset name:", dname)
        all_results[dname] = {}
        selected_results[dname] = {}
        metrics_save[dname] = {}
        labels = self.datasets[di]["labels"]
        task_config = (
            task_configs[di] if di < len(task_configs) else task_configs[0]
        )
        for mi, model_name in enumerate(model_names):
            print("Model name:", model_name)
            # Check if the model name is provided as a string or a
            # dictionary
            if isinstance(model_names, dict):
                model_path = model_names[model_name]
            else:
                model_path = model_name
                model_name = os.path.basename(model_name)
            # Check if the model is local or remote
            source = sources[mi]
            # Load the model and tokenizer
            if source == "local":
                model, tokenizer = load_model_and_tokenizer(
                    model_path, task_config=task_config
                )
            else:
                # Check if the model is available in the model library
                # if model_path not in self.all_models[source]:
                # print(f"Model \'{model_path}\' not found in our available
                # models list.")
                #     continue
                # try:
                model, tokenizer = load_model_and_tokenizer(
                    model_path,
                    task_config=task_config,
                    source=source,
                    use_mirror=use_mirror,
                )
                # except Exception:
                #     if os.path.exists(model_path):
                #         model, tokenizer = load_model_and_tokenizer(
                #             model_path, task_config=task_config
                #         )
                #     else:
                #         raise NameError(
                # "Cannot find model in either the given source or local."
                #         ) from None
            dataset = DNADataset(
                self.datasets[di],
                tokenizer=tokenizer,
                max_length=pred_config.max_length,
            )
            dataset.encode_sequences(remove_unused_columns=True)
            dataloader: DataLoader = DataLoader(
                dataset,
                batch_size=pred_config.batch_size,
                num_workers=pred_config.num_workers,
            )
            inference_engine = self.get_inference_engine(model, tokenizer)
            # Perform the prediction
            logits, _, _ = inference_engine.batch_infer(
                dataloader, do_pred=False
            )
            if len(labels) == len(logits):
                metrics = inference_engine.calculate_metrics(
                    logits, labels, plot=True
                )
                all_results[dname][model_name] = metrics
                # keep selected metrics
                if selected_metrics:
                    selected_results[dname][model_name] = {}
                    if "all" in selected_metrics:
                        selected_results[dname][model_name] = metrics
                    else:
                        for metric in selected_metrics:
                            selected_results[dname][model_name][metric] = (
                                all_results[dname][model_name][metric]
                            )
                else:
                    selected_results[dname][model_name] = metrics
                # keep all metrics if save_scores is True
                if save_scores:
                    metrics2 = dict(metrics)
                    metrics2.pop("curve", None)
                    metrics2.pop("scatter", None)
                    metrics_save[dname][model_name] = metrics2
    # Save the metrics
    if save_scores and pred_config.output_dir:
        save_metrics(metrics_save, Path(pred_config.output_dir))
    if self.prepared:
        return selected_results
    else:
        return all_results
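
An end-to-end sketch without a preset "benchmark" configuration (model identifiers are illustrative; get_dataset() must have been called first so that an evaluation set is loaded):

results = bench.run(
    model_names={"ModelA": "org/model-a", "ModelB": "org/model-b"},
    source="huggingface",
    save_scores=True,
)
for model_name, metrics in results["custom"].items():  # "custom" is the default dataset key
    print(model_name, metrics)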

Functions