### Loading the file ###
import pandas as pd
f="C:/MP-CHEW/CHEW/cycle_2/lca.tsv"
df=pd.read_csv(f,sep="\t")#.sample(2000) #you can just sample here for testing
purposes
df["proteins"]=df["proteins"].str.split(", ")
edf=df.explode("proteins")
edf["OX"]=edf["proteins"].str.split("_").apply(lambda x: "_".join(x[-3:])) #these
are the taxonomies (GTDB taxid instead of NCBI)
#Data reduction: trim taxa based on frequency
s=edf.groupby("OX").size()
edf=edf[edf["OX"].isin(s[s>5].index)]
ut=edf["OX"].drop_duplicates()
### create adjacency matrix ###
dfm =edf[["u_ix","OX"]].merge(edf[["u_ix","OX"]],on="u_ix").query("OX_x != OX_y")
out=pd.crosstab(dfm["OX_x"],dfm["OX_y"])
#Data reduction: trim crosstab based on minimum adjacency?
# q=out[out.sum()>2].index
# out=out.loc[q,q]
### Graph construction
import networkx as nx
from node2vec import Node2Vec
graph=nx.from_pandas_adjacency(out)
node2vec=Node2Vec(graph,dimensions=10,walk_length=5,num_walks=20,workers=4) #not
sure what parameters I should select here
model=node2vec.fit(window=10,min_count=1)
### 2D embedding
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
# Retrieve node embeddings and corresponding subjects
node_targets = model.wv.index_to_key # list of node IDs
node_embeddings = (
model.wv.vectors
) # numpy.ndarray of size number of nodes times embeddings dimensionality
trans = TSNE(n_components=2) # or PCA
node_embeddings_2d = trans.fit_transform(node_embeddings)
### Clustering
from sklearn.cluster import DBSCAN
sw=s.loc[node_targets]
clustering = DBSCAN(eps=2,
min_samples=1).fit(node_embeddings_2d)#,sample_weight=sw) #distance would need
optimization?
clusters=clustering.labels_
uc=np.unique(clusters)
cluster_count=len(uc)
#plot clusters
import seaborn as sns
node_colors=np.array(sns.color_palette("Spectral",n_colors=cluster_count))
[clusters]
plt.scatter( node_embeddings_2d[:, 0],
node_embeddings_2d[:, 1],
c=node_colors,)
#plot intensity
plt.scatter( node_embeddings_2d[:, 0],
node_embeddings_2d[:, 1],
c=np.log(s.loc[node_targets].values)
,cmap='Spectral',s=0.2)