-
Notifications
You must be signed in to change notification settings - Fork 0
/
metrics_use.py
71 lines (51 loc) · 2.35 KB
/
metrics_use.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import csv
import os
# Input file: one record per line in the form "<cluster>:><accession>_<protein>".
PROTEIN_TABLE_PATH = "/home/ubuntu/graph_genomes/data/2019.04.12/protein_name.txt"


def split_protein_ids(data):
    """Split the raw ``protein`` column into genome accession and protein name.

    Args:
        data: DataFrame with the columns ``cluster`` and ``protein``.

    Returns:
        A new DataFrame (the input is not modified) without rows that had a
        missing value, where ``protein`` is replaced by three columns:
        ``accession_no`` (text before the first ``_``), ``prot`` (text after
        it, NaN when the ID has no ``_``) and ``unique``
        (``accession_no + '_' + prot``, identifying a protein within a genome).
    """
    # Drop incomplete rows first so the string operations never see NaN.
    data = data.dropna()
    # NOTE(review): splitting on the first "_" assumes accession numbers do not
    # contain an underscore themselves - confirm against the real input.
    # reindex() guarantees both columns exist even when no ID contains "_"
    # (expand=True would otherwise produce a single column).
    parts = data["protein"].str.split("_", n=1, expand=True).reindex(columns=[0, 1])
    data["accession_no"] = parts[0]
    data["prot"] = parts[1]
    # Drop the old combined column; "prot" must stay, it is used below and by
    # the metrics (the original script dropped "prot" here and then crashed
    # with KeyError on the next line).
    data = data.drop(columns=["protein"])
    data["unique"] = data["accession_no"] + "_" + data["prot"]
    return data


def load_protein_table(path=PROTEIN_TABLE_PATH):
    """Read the cluster/protein file at *path* and return the parsed table.

    The file is parsed with ``:>`` as the field separator and no header row;
    the result is passed through :func:`split_protein_ids`.
    """
    raw = pd.read_csv(
        path,
        delimiter=':>',
        engine='python',
        header=None,
        names=['cluster', 'protein'],
    )
    return split_protein_ids(raw)


def cluster_metrics(data):
    """Compute the four cluster statistics reported by this script.

    Args:
        data: DataFrame as returned by :func:`split_protein_ids`.

    Returns:
        dict with the keys
        ``n_clusters`` (int): number of distinct clusters;
        ``proteins_per_cluster`` (Series named ``sum``, indexed by cluster):
        number of rows with a protein name in each cluster;
        ``single_copy_clusters`` (int): clusters that contain exactly one
        protein from every genome;
        ``multi_cluster_proteins`` (int): proteins (``unique`` IDs) that occur
        in more than one distinct cluster.
    """
    # Rows = genomes, columns = clusters, cells = number of proteins.
    genome_by_cluster = pd.crosstab(data['accession_no'], data['cluster'])

    # A plain group size replaces the dense cluster x protein crosstab that was
    # previously built only to be summed row-wise.
    proteins_per_cluster = (
        data.dropna(subset=['prot']).groupby('cluster').size().rename('sum')
    )

    # Every genome must contribute exactly one protein. Comparing only the
    # cluster total with the number of genomes would also accept clusters with
    # two proteins from one genome and none from another.
    single_copy = (genome_by_cluster == 1).all(axis=0)

    # Count distinct clusters per protein, so a protein listed twice in the
    # same cluster is not reported as being in multiple clusters.
    clusters_per_protein = (
        data.dropna(subset=['unique']).groupby('unique')['cluster'].nunique()
    )

    return {
        'n_clusters': int(genome_by_cluster.shape[1]),
        'proteins_per_cluster': proteins_per_cluster,
        'single_copy_clusters': int(single_copy.sum()),
        'multi_cluster_proteins': int((clusters_per_protein > 1).sum()),
    }


def main():
    """Load the protein table and print the four cluster statistics."""
    metrics = cluster_metrics(load_protein_table())
    # How many clusters there are
    print('\n 1. There are %i clusters' % metrics['n_clusters'])
    # How big each cluster is
    print('\n 2. Number of protein in each cluster:')
    print(metrics['proteins_per_cluster'])
    # How many clusters have exactly one protein from every genome
    print("\n 3. There are %i clusters have exactly one protein from every genome" % metrics['single_copy_clusters'])
    print('\n 4. %i proteins are in multiple cluster' % metrics['multi_cluster_proteins'])
    # Optional visual check: pd.crosstab(...).plot(kind="bar", stacked=True)


# Guarded so the helpers can be imported without touching the input file.
if __name__ == "__main__":
    main()