-
Notifications
You must be signed in to change notification settings - Fork 0
/
report.py
79 lines (53 loc) · 2.14 KB
/
report.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# -*- coding: utf-8 -*-
"""
Script for generating the output report after aligning FASTQ files
(raw reads) to the genome graph.
"""
#from Bio import SeqIO
import pandas as pd
import os
import csv
import numpy as np
import matplotlib.pyplot as mpl
# Current working directory. It was only consumed by an earlier
# csv.DictReader-based loader for support_files/access_vs_prot.csv, which has
# been superseded by the pandas loader further down.
path = os.getcwd()

# Alignment summary: tab-separated, no header row, four columns per record.
report = pd.read_csv(
    "summarise.csv",
    sep="\t",
    names=["gene", "read_count", "length", "coverage"],
)
# Discard records with any missing field before the string handling below.
report.dropna(inplace=True)

# Cut each gene label at its first underscore: the left part becomes the
# accession id, the remainder becomes the protein name.
new = report["gene"].str.split("_", n=1, expand=True)
report["accession_id"] = new[0]
report["prot"] = new[1]

# Lookup from accession id to the number of mature peptides extracted for
# that species, taken from the single named column '' of the support file.
acc_no_vs_prot = pd.read_csv(
    "support_files/access_vs_prot.csv",
    sep=",",
    names=[""],
)
d = acc_no_vs_prot[""].to_dict()

# One total per record, in record order; an accession id that is missing from
# the lookup raises KeyError.
total_prot = [d[accession] for accession in report["accession_id"]]
report["total_prot"] = total_prot

# Cross-tabulate the accession ids against themselves.
table1 = pd.crosstab(
    report["accession_id"],
    report["accession_id"],
    margins=False,
)
def report(align_result_file, table_name, prot_counts=None):
    """Build the accession-id cross-tabulation for one alignment result file.

    The file is read as headerless, tab-separated data with the columns
    ``gene``, ``read_count``, ``length`` and ``coverage``. Rows with any
    missing value are dropped, and each ``gene`` value is split at its first
    underscore into ``accession_id`` and ``prot``.

    Args:
        align_result_file: Path (or file-like object) handed to
            ``pandas.read_csv``.
        table_name: Not used by the computation. It is kept so existing
            calls keep working; the resulting table is returned instead of
            being bound to this name.
        prot_counts: Optional mapping from accession id to its protein
            total. When omitted, the module-level lookup ``d`` is used.

    Returns:
        The ``pandas.crosstab`` of the ``accession_id`` column against
        itself.

    Raises:
        KeyError: If an accession id found in the file has no entry in the
            protein-count mapping.
    """
    if prot_counts is None:
        # Fall back to the lookup built by the module-level script code.
        prot_counts = d

    table = pd.read_csv(
        align_result_file,
        sep="\t",
        names=["gene", "read_count", "length", "coverage"],
    )
    # Drop incomplete rows so the string split below only sees real labels.
    table.dropna(inplace=True)

    gene_parts = table["gene"].str.split("_", n=1, expand=True)
    # Both derived columns belong on the local frame; writing the accession
    # ids anywhere else leaves table["accession_id"] undefined below.
    table["accession_id"] = gene_parts[0]
    table["prot"] = gene_parts[1]

    # Direct indexing (not .get) so an unknown accession id fails loudly.
    table["total_prot"] = [prot_counts[acc] for acc in table["accession_id"]]

    # Return the table: rebinding the table_name parameter would discard it.
    return pd.crosstab(
        table["accession_id"],
        table["accession_id"],
        margins=False,
    )