Embedding
In [1]:
#!pip install seaborn
In [2]:
import sys
import random
import inspect
import numpy as np
import pandas as pd
from scipy.special import softmax
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
In [3]:
model_path = "InstaDeepAI/nucleotide-transformer-v2-50m-multi-species"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(model_path, trust_remote_code=True)
max_length = tokenizer.model_max_length
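As a quick sanity check on the tokenizer, the sketch below looks at how a short sequence is split into tokens. It assumes only the standard Hugging Face tokenizer interface (calling the tokenizer and batch_decode, both used later in this notebook); the example sequence is the short one from the commented-out list further down.

# Hypothetical sanity check: how does the tokenizer split a short sequence?
example = "ATTCCGATTCCGATTCCG"
example_ids = tokenizer(example)["input_ids"]
print(tokenizer.batch_decode([example_ids]))   # expect 6-mer tokens plus special tokens
print(f"vocabulary size: {len(tokenizer)}, model_max_length: {max_length}")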
In [4]:
def random_generate_sequences(minl, maxl=0, samples=1, with_N=False, padding_size=0, gc=0, seed=None):
    sequences = []
    basemap = ["A", "C", "G", "T"]
    if with_N:
        basemap.append("N")
    baseidx = len(basemap) - 1
    if seed:
        random.seed(seed)
    if maxl:
        for i in range(samples):
            length = random.randint(minl, maxl)
            if padding_size:
                length = (length // padding_size + 1) * padding_size if length % padding_size else length
                if length > maxl:
                    length -= padding_size
            seq = "".join([basemap[random.randint(0,baseidx)] for _ in range(length)])
            sequences.append(seq)
    else:
        for i in range(samples):
            seq = "".join([basemap[random.randint(0,baseidx)] for _ in range(minl)])
            sequences.append(seq)
    return sequences
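A note on the padding_size argument used below: with padding_size=6 every random length is rounded up to the next multiple of six, so each sequence tiles exactly into 6-mer tokens. A small check of that behaviour (the seed value is arbitrary and only added here for reproducibility):

# All generated lengths should be multiples of padding_size (6)
check = random_generate_sequences(30, 500, samples=20, padding_size=6, seed=42)
print(sorted({len(s) % 6 for s in check}))   # expected: [0]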
In [5]:
# sequences = ["ATTCCGATTCCGATTCCG", "ATTTCTCTCTCTCTCTGAGATCGATCGATCGAT"]
sequences = random_generate_sequences(30, 500, 100, padding_size=6)
sequences[:10]
Out[5]:
['AATCTCTGTCGACAGTTCTCGAGCTCTAGCGAGTTCCCTGATCTGGAAGTAGCGACAATCTTCCGGGTCTCATCTTTTCAAACTCGTGATCGTTGTTGGGGTCTGTGATTAGGGATGGAACCTCATACCTGCGCGTCGGTGTGTGACCTTCAATAGCTGGGAAGGCCGGTAAGGGTCTATGAGAATATATGTGTACTACGGAGAGTCGGCCACTGGTCGAGGCGACAATAATTGGGCCCTAGGATCACGACAGATCTCGTAAATAGCAGCAGAGTTACCAGGCCCATGACGTGATCCGCGTAGAAGATGATACCCGTGAATTAGGTGTTACTCTTA', 'CGAGTTTCGTATGGTAACCATCACGGGCTTAATTACCCGCAGCGCATAGTACCGTCTACCGTGACTCTAAGCACGGGTCCCCGCTTTACTATTGCCTCAGCGATTTGCACCCTGGGATGAGCCATATTACGGCAACTTGGGTCGGCAAGTATGTTCAATAGCGTACTCCTTCACTCAGTGATTGCTATGTCCAACGTGCATTTTGTCGCGGAGAGAGTCGTACAGAATTCTCGA', 'TTTGAATCTTTTATCCCTCTATTGCGCTTTAAATTGTAAGGATATTCCGTCAGACTCCCAAACATAGGCTCGATGCTGAGGTAGATTGTCATTATCTATCTTACGCAATGGTTAACCATGAGCAGTTTACTCTACGAA', 'TAGTCCCACATGCGAACATGACCCGCAGTGCGATGAGCGAGTGCTGCATGTGTCGTAAATGGAAATAAAGTGGCCTCGGCTGAGCATTAAAATGCTCAACCCAGGGTGCCTACGCTAAAATCCGAACAGCGAAGAATTACGTCGTCGCCG', 'CCATCAGAGGGAGCAAGTGGGCACAGACCACTAATTTCGTCGAATTCCGGCTCATATATGGGGATACCAGTCGCTCTTCGTGCCCATCATTATTTGCTTAGAGACTTTTCGACTGTCTCGCGGATAAGCTCAAAAGACATTAAGGGTTGTTTCCCACGACCAAGTTCGTAGAACCGCCGCATGAGATCAACTGTTGGCTACTAGGTATGGGTCAAGGGATTATGCCTTACTCTAAGTTGTAAAAAGCCCCTAAGGCCATGGCACGCTCCGTGCACCTGAAATAGCGCGTCTTGGCGGTACGAGAGACACACCCAGACGTGGCGATAGATTTGAATGTTCTCTTGATTCGCACAGCAACCTCCCGACAAATTTTGCCACTGCTTGCCCGCATGGGGTAGACATCTCACA', 'ACATAATAAGGGGGGACACACGGGAACTTGAGCTCCCAGCAGGATGCCTGGTAGGGTGCAATATAAAAAGGCCTGCAATAATTGAACTAGCCCATCTTCTTGGCGAAAGTCCAGCTATACCTCACAAGTACGTTCGGTTTTGCCCTCTGCATGGCAGGCCAG', 'GGGACGTGGTACATATTGGTCAGACGCAACCTGCGGTTTCCCTTAAGACCTATGTAGAAGGCCATAACTAGTTTTTTTTAGGGCCGGGATATCCTTAGTACTAATTCCGCAGTATTCCCAATGTTTCCCAGATCGAAGGATAGCCGTGATGCAGTATGCGTGTGATAAGGGGCCAAGTGTGTATCCCCATACTAATTATGCGCTATCGTGCCCCTTAGGCCCAAACTACGTAAATGCCCCTTAAAAAGCGCGCGCACAAAGCCGCGTATTAATATAACGAGCCCGATGCCTAGCCTATATTTTCATACGAATTTGATCAACTCAAAGGCGACAGGGGACCGTCTTTCTATAGTACCTGGATGGTTTCCTCAGCGCAGT', 'GTCCATCACGAGGGATATGTTACGGCACAGAAGGGCATATCGTTAAAACCGCTTGACGACTACCAGACTATGTAGGCTAGCTCTGTCCGGCGGCTTTCAAACGACAAACTCTTGACCACCGAGGACGACTACGTGAGGCACCTCGCCCTA', 'TGGAATCATGGTCTAGGCTTTAGGGTTGTAGCATAAGGCTCTTATTCACCTCTTATTTGGTGTGATGAGCGGAAGTTATAATAAAGAACTCCAATAACTGTGCTGCCATCGCCGCTGACGCATACCTAGGTCAGGATAGAGGATATTGGTTACGTACCCTCAGTTCTGCGTACACGTCACGATTCTAAGCTTAGTAGGTACCACATGGATTAGCTATGAGGTTATAATGGTTCCTGTGGCATTGCATTAGGGATAGGCGCCGCCACTGACACGCCAAAAGCTACGCTGCGATTCCCTATATAATTGGCGGACTGAAACTGTGCCGCCAATACTGGGGGGTCTATAATGACCATCCCTGTTAACGAGATCACACGTCTGGGAAGTGCTTGTCATATGATGCCACTACAGAGATGGGGAACTAGTCCAGACGTAGGGATGTAACAGGTTTTTGAAAACATTTTTTTAACTCAATGAGCCGATATCATCTACCCGGCGGTC', 'TAGGGTCTTATTTCCGAATTCAGACCCACGAAGCCTTTGCCCGTGTCCCCGCGTATTAGC']
In [6]:
inputs = tokenizer(
    sequences,
    truncation=True, padding='longest',
    max_length=max_length,
    return_tensors="pt"
)
tokens_ids = inputs['input_ids'].detach()
tokens_str = [b.split() for b in tokenizer.batch_decode(tokens_ids)]
tokens_idx = [[False if s in tokenizer.all_special_tokens else True for i, s in enumerate(tokens)] for tokens in tokens_str]
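To see what the tokenizer produced, the snippet below prints the first few token strings of one sequence together with its boolean mask; False marks special (and padding) tokens, which are excluded from the plots later on. It is purely illustrative and only uses variables defined above.

# Peek at the tokenization of the second sequence
print(tokens_str[1][:8])    # first few decoded tokens (the first is usually a special token)
print(tokens_idx[1][:8])    # False = special/padding token
print(f"padded length: {len(tokens_str[1])}, real tokens: {sum(tokens_idx[1])}")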
In [7]:
# Predict
sig = inspect.signature(model.forward)
params = sig.parameters
if "output_attentions" in params:
    outputs = model(
        **inputs,
        output_attentions=True,
        output_hidden_states=True
    )
else:
    outputs = model(
        **inputs,
        output_hidden_states=True
    )
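Before plotting it helps to check what the forward pass returned; exact sizes depend on the checkpoint (number of layers, heads, hidden size) and on the longest sequence in the batch. A minimal shape check:

# hidden_states: typically one tensor per layer plus the embedding output, each (batch, seq_len, hidden_dim)
print(len(outputs['hidden_states']), outputs['hidden_states'][-1].shape)
# logits over the token vocabulary: (batch, seq_len, vocab_size)
print(outputs['logits'].shape)
# attentions (when returned): one tensor per layer, each (batch, n_heads, seq_len, seq_len)
if hasattr(outputs, 'attentions') and outputs.attentions is not None:
    print(len(outputs.attentions), outputs.attentions[-1].shape)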
Attention Map
In [8]:
def plot_attention_map(attentions, tokens_str, tokens_idx, layer=-1, idx=0, ncols=3, scale_width=5, scale_height=4):
    # attentions: one tensor per layer, each of shape (batch, n_heads, seq_len, seq_len)
    tokens = [tokens_str[idx][i] for i, b in enumerate(tokens_idx[idx]) if b]
    layer_attentions = attentions[layer][idx].detach().numpy()  # (n_heads, seq_len, seq_len)
    n_heads = layer_attentions.shape[0]
    nrows = (n_heads + ncols - 1) // ncols
    figsize = (ncols * scale_width, nrows * scale_height)
    fig, axes = plt.subplots(nrows, ncols, figsize=figsize, squeeze=False)
    axes = axes.flatten()
    for i in range(n_heads):
        # drop rows/columns that correspond to special or padding tokens
        data = [[layer_attentions[i][j][jj] for jj, bb in enumerate(tokens_idx[idx]) if bb]
                for j, b in enumerate(tokens_idx[idx]) if b]
        sns.heatmap(
            data,
            ax=axes[i], cmap="viridis",
            xticklabels=tokens,
            yticklabels=tokens
        )
        axes[i].set_title(f"Head {i+1}")
    for j in range(n_heads, len(axes)):
        fig.delaxes(axes[j])
    fig.tight_layout()
    plt.show()
In [9]:
# Attention map
if hasattr(outputs, 'attentions'):
    attentions = outputs.attentions  # ((seq_num, heads, max_token_len, max_token_len) x layers)
    plot_attention_map(attentions, tokens_str, tokens_idx, layer=-1, idx=1, ncols=3)
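Beyond per-head heatmaps, a compact summary is how much attention each token receives on average. The sketch below is an add-on rather than part of the plotting helper above: it averages the last layer's attention over heads and over query positions for the same sequence and lists the five most attended tokens.

# Average attention received per token (last layer, sequence idx=1); illustrative only
if hasattr(outputs, 'attentions'):
    idx = 1
    attn = outputs.attentions[-1][idx].detach().numpy()       # (n_heads, seq_len, seq_len)
    keep = np.array(tokens_idx[idx])                          # real (non-special) token positions
    received = attn.mean(axis=0)[keep][:, keep].mean(axis=0)  # avg over heads, then over query positions
    tokens = [t for t, k in zip(tokens_str[idx], tokens_idx[idx]) if k]
    for tok, score in sorted(zip(tokens, received), key=lambda x: -x[1])[:5]:
        print(f"{tok}\t{score:.4f}")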
Embedding visualization
In [10]:
def plot_layer_embeddings(hidden_states, attention_mask, layers=[0,1], labels=None, reducer="t-SNE", ncols=4, scale_width=5, scale_height=4):
    if reducer.lower() == "pca":
        dim_reducer = PCA(n_components=2)
    elif reducer.lower() == "t-sne":
        dim_reducer = TSNE(n_components=2)
    elif reducer.lower() == "umap":
        dim_reducer = UMAP(n_components=2)
    else:
        raise ValueError("Unsupported dim reducer, please try PCA, t-SNE or UMAP.")
    n_layers = len(layers)
    nrows = (n_layers + ncols - 1) // ncols
    figsize = (ncols * scale_width, nrows * scale_height)
    fig, axes = plt.subplots(nrows, ncols, figsize=figsize, squeeze=False)
    axes = axes.flatten()
    for i, layer_i in enumerate(layers):
        embeddings = hidden_states[layer_i].detach()
        # mask out special/padding tokens, then mean-pool over the sequence dimension
        mean_sequence_embeddings = torch.sum(attention_mask*embeddings, axis=-2) / torch.sum(attention_mask, axis=1)
        layer_dim_reduced_vectors = dim_reducer.fit_transform(mean_sequence_embeddings.numpy())
        if not labels:
            labels = ["Uncategorized"] * layer_dim_reduced_vectors.shape[0]
        dataframe = {
            'Dimension 1': layer_dim_reduced_vectors[:,0],
            'Dimension 2': layer_dim_reduced_vectors[:,1],
            'labels': labels
        }
        df = pd.DataFrame.from_dict(dataframe)
        sns.scatterplot(
            data=df,
            x='Dimension 1',
            y='Dimension 2',
            hue='labels', ax=axes[i]
        )
        axes[i].set_title(f"Layer {layer_i+1}")
    for j in range(n_layers, len(axes)):
        fig.delaxes(axes[j])
    fig.tight_layout()
    plt.show()
In [11]:
# Get the layer embeddings
hidden_states = outputs['hidden_states']
attention_mask = torch.unsqueeze(torch.tensor(tokens_idx), dim=-1)
plot_layer_embeddings(hidden_states, attention_mask, layers=range(6), labels=None, reducer="t-SNE", ncols=3)
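The same masked mean pooling can be used directly when the goal is simply to export one embedding vector per sequence (for clustering, nearest-neighbour search, or a downstream classifier). A minimal sketch using the last hidden layer and the variables defined above:

# Mean-pool the last layer over real tokens only -> one embedding vector per sequence
last_hidden = hidden_states[-1].detach()          # (batch, seq_len, hidden_dim)
mask = attention_mask.float()                     # (batch, seq_len, 1), 1.0 for real tokens
sequence_embeddings = (last_hidden * mask).sum(dim=1) / mask.sum(dim=1)
print(sequence_embeddings.shape)                  # (n_sequences, hidden_dim)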
Probability of token
In [12]:
def get_token_probability(probabilities, idx=0, top_k=5):
    tokens_probs = []
    probas = probabilities[idx]
    for pos, probs in enumerate(probas):
        sorted_positions = np.argsort(-probs)
        sorted_probs = probs[sorted_positions]
        token_probs = {}
        for k in range(top_k):
            predicted_token = tokenizer.id_to_token(int(sorted_positions[k]))
            prob = sorted_probs[k]
            # print(f"seq_id: {idx}, token_position: {pos}, k: {k}, token: {predicted_token}, probability: {prob * 100:.2f}%")
            token_probs[predicted_token] = prob
        tokens_probs.append(token_probs)
    return tokens_probs
In [13]:
logits = outputs['logits'].detach().numpy()
probabilities = []
# get probabilities separately for each seq as they have different lengths
for idx in range(logits.shape[0]):
    logits_seq = logits[idx]
    logits_seq = [logits_seq[i] for i, b in enumerate(tokens_idx[idx]) if b]
    probs = softmax(logits_seq, axis=-1)  # use softmax to transform logits into probabilities
    probabilities.append(probs)
tokens_probs = get_token_probability(probabilities, idx=1, top_k=5)
tokens_probs
Out[13]:
[{'CGAGTT': 0.17259638,
'ATTTTG': 0.074336775,
'AATTTT': 0.03158132,
'ACTTTG': 0.029733788,
'ACTTTT': 0.027789203},
{'TCGTAT': 0.9961033,
'CCGTAT': 0.002380367,
'TTGTAT': 0.00013929073,
'TCGCAT': 0.00012747609,
'TCATAT': 8.166215e-05},
{'GGTAAC': 0.99959,
'GGTAAT': 0.00017227474,
'GGGTAA': 2.4163122e-05,
'GGTAAA': 2.2399232e-05,
'TGGTAA': 2.146018e-05},
{'CATCAC': 0.97606814,
'ATCCCC': 0.005534643,
'ATCACC': 0.0022881867,
'CGTCAC': 0.0019009738,
'ATTCCC': 0.0010483828},
{'GGGCTT': 0.99871385,
'GGACTT': 0.0001330908,
'GGGTTT': 0.000121900695,
'AGGCTT': 0.00011995096,
'CGGGTT': 7.997676e-05},
{'AATTAC': 0.9997267,
'GATTAC': 4.0375737e-05,
'ACTTAC': 2.550763e-05,
'ATTTAC': 1.6145128e-05,
'AACTAC': 1.5840844e-05},
{'CCGCAG': 0.999956,
'CCGCAC': 1.18452435e-05,
'CCACAG': 6.178044e-06,
'CTGCAG': 3.992571e-06,
'CCGCGG': 3.4355671e-06},
{'CGCATA': 0.99955136,
'TGCATA': 0.00012145034,
'AGCGTA': 4.034968e-05,
'AGCATA': 3.7026664e-05,
'CGCGTA': 3.6516085e-05},
{'GTACCG': 0.9998487,
'TACCGG': 2.1025691e-05,
'GTACCA': 2.0927728e-05,
'GGACCG': 2.0473371e-05,
'GTTCCG': 1.207671e-05},
{'TCTACC': 0.9964585,
'TCTATC': 0.0021104612,
'TCTACT': 0.0005306293,
'TCTGCC': 0.0003090096,
'TCCACC': 0.0002797095},
{'GTGACT': 0.99964905,
'GCGACT': 6.814439e-05,
'GTGACA': 4.108843e-05,
'ATGACT': 3.0910054e-05,
'CTGACT': 2.6076928e-05},
{'CTAAGC': 0.9998443,
'CTAAGT': 3.1842705e-05,
'CCAAGC': 2.6327254e-05,
'GTAAGC': 1.9758221e-05,
'CTGAGC': 1.924362e-05},
{'ACGGGT': 0.99969983,
'GCGGGT': 7.628152e-05,
'ACGGGC': 5.7914764e-05,
'ACAGGT': 4.4192246e-05,
'ATGGGT': 1.8808125e-05},
{'CCCCGC': 0.9998889,
'CCCAGC': 2.2182645e-05,
'CCCTGC': 2.1937427e-05,
'CCCGCC': 1.8683259e-05,
'TCCCGC': 1.6611688e-05},
{'TTTACT': 0.9978835,
'TTTAGT': 0.00022872274,
'TTTATT': 0.00021545753,
'TTTAGA': 0.00012321863,
'TTTCCT': 0.00010700531},
{'ATTGCC': 0.9996586,
'ATTGTC': 7.325049e-05,
'ATTGCT': 6.860292e-05,
'ATTGCG': 2.8827857e-05,
'ATTGCA': 2.580576e-05},
{'TCAGCG': 0.9995969,
'TCAACG': 0.0001207212,
'TCAGTG': 6.348614e-05,
'TCAGCA': 2.96002e-05,
'TCAGAG': 2.6319307e-05},
{'ATTTGC': 0.9996542,
'ATTTGT': 9.031595e-05,
'ATTCGC': 3.39291e-05,
'ACTTGC': 1.8995961e-05,
'ATTGGC': 1.5615282e-05},
{'ACCCTG': 0.9983924,
'ACCCCG': 0.00037356524,
'GCCCTG': 0.00019969455,
'ACGCTG': 0.00016784917,
'ACTCTG': 8.21346e-05},
{'GGATGA': 0.98345125,
'AGATGA': 0.0083533535,
'AGATAA': 0.0011463522,
'GGATAA': 0.0009950879,
'GGACGA': 0.0007692377},
{'GCCATA': 0.99653244,
'GCCGTA': 0.0017881092,
'CCCATA': 0.0005162992,
'GCTGTA': 0.00028362343,
'ACCATA': 0.00013441472},
{'TTACGG': 0.9990121,
'TTACAA': 0.00045957882,
'TTACGA': 0.00017892425,
'TTACAG': 0.00012856742,
'TTATGG': 3.4307715e-05},
{'CAACTT': 0.9993787,
'TAACTT': 7.4429176e-05,
'CAACTG': 4.3916898e-05,
'CAATTT': 3.8595674e-05,
'CAACTC': 3.402011e-05},
{'GGGTCG': 0.9997696,
'GGGTCA': 4.0975166e-05,
'GGGTTG': 3.3173714e-05,
'GGGTCT': 1.3578625e-05,
'GGTCGG': 1.3248425e-05},
{'GCAAGT': 0.99595284,
'ACAAGT': 0.0010641017,
'GCGAGT': 0.00079922826,
'GCAAGC': 0.0002992736,
'TCAAGT': 0.00022391975},
{'ATGTTC': 0.9975695,
'ACGTTC': 0.0007419577,
'GTGTTC': 0.0002995831,
'ATATTC': 0.0002683154,
'ATGTTG': 0.00021825168},
{'AATAGC': 0.9996363,
'AATAGT': 7.465532e-05,
'AATAGA': 4.515491e-05,
'CATAGC': 3.187673e-05,
'AATGGC': 2.12161e-05},
{'GTACTC': 0.9995332,
'GTACTT': 0.00040913644,
'GTACGC': 1.0108596e-05,
'GTATTC': 9.164296e-06,
'GTACCC': 6.7496317e-06},
{'CTTCAC': 0.99902916,
'CTTCAG': 0.0005663351,
'TTTCAC': 0.00010902189,
'CTTCAT': 3.6280082e-05,
'CATCAC': 2.6068687e-05},
{'TCAGTG': 0.99983084,
'CCAGTG': 2.8872284e-05,
'TCATTG': 1.5589605e-05,
'TCACTG': 1.1398803e-05,
'TCAGAG': 1.0691216e-05},
{'ATTGCT': 0.99978036,
'ATTGCC': 7.3094234e-05,
'GTTGCT': 4.9736147e-05,
'ATTGTT': 2.7429634e-05,
'ATTACT': 1.4872187e-05},
{'ATGTCC': 0.99388593,
'ATGCCC': 0.002409024,
'ACGTCC': 0.0010945916,
'ATGTCT': 0.0004333326,
'ATATCC': 0.0004030408},
{'AACGTG': 0.9973693,
'AACATG': 0.0002474348,
'GACGTG': 0.00018803342,
'AACGTC': 0.00018510975,
'AATGTG': 0.0001479922},
{'CATTTT': 0.9995813,
'CGTTTT': 3.7846006e-05,
'AAGTTT': 3.2062322e-05,
'TTTTTT': 2.970213e-05,
'AATTTT': 2.6819005e-05},
{'GTCGCG': 0.99959356,
'GTCGTG': 0.000115663956,
'CGCGGG': 3.4483375e-05,
'GCGGCG': 3.386643e-05,
'GTCACG': 2.5877345e-05},
{'GAGAGA': 0.998789,
'AGAGAG': 0.00019560783,
'GAGAGG': 0.00014330231,
'GGGAGA': 9.80927e-05,
'AAGAGA': 7.373096e-05},
{'GTCGTA': 0.99971396,
'GTTGTA': 7.912869e-05,
'GCCGTA': 7.065815e-05,
'GTCATA': 3.3755936e-05,
'GTCGTG': 2.5411036e-05},
{'CAGAAT': 0.99983215,
'GAGAAT': 7.2248535e-05,
'CAGGAT': 1.6000364e-05,
'CAAAAT': 1.3472809e-05,
'CAGAAC': 9.629115e-06},
{'TCTCGA': 0.999401,
'TCTTGA': 0.0002367876,
'TCGCGA': 0.0001701102,
'TTTCGA': 3.5704878e-05,
'TCTCAA': 1.5103107e-05}]
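The dictionaries above show the model's top predictions at each position. A related quantity is the probability assigned to the token that is actually present, which gives a rough per-sequence score. The sketch below is only an approximation, since the logits were computed without masking the positions being scored:

# Probability of the observed token at each real position of one sequence (no masking applied)
idx = 1
probs_seq = probabilities[idx]                    # (n_real_tokens, vocab_size)
observed_ids = [int(t) for t, keep in zip(tokens_ids[idx], tokens_idx[idx]) if keep]
observed_probs = [probs_seq[pos][tok] for pos, tok in enumerate(observed_ids)]
print(f"mean probability of observed tokens: {np.mean(observed_probs):.4f}")
print(f"mean log-likelihood per token: {np.mean(np.log(observed_probs)):.4f}")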