-
Notifications
You must be signed in to change notification settings - Fork 0
/
genbank_parser.py
283 lines (217 loc) · 9.76 KB
/
genbank_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
r"""
Author: Hooman Bahrdo
date of final revision: 5 Jan. 2023
explanation: this main program contains a class (GenbankParser), and one main
function. this program gets a genbank file, highlights the relavant
parts, and make a multi Fasta text file.
inpute lines base on sys are:
1- python3 .\genbank_parser.py .\CFTR_DNA.gb seperated
2- python3 .\genbank_parser.py .\CFTR_DNA.gb uppercased
3- python3 .\genbank_parser.py .\CFTR_mRNA.gb seperated
4- python3 .\genbank_parser.py .\CFTR_mRNA.gb uppercased
5- python3 .\genbank_parser.py .\CFTR_protein.gp seperated
6- python3 .\genbank_parser.py .\CFTR_protein.gp uppercased
"""
import re
import textwrap
import sys
from features import Features
from feature_string import FeatureString
from exceptions import InvalidOriginArgument
class GenbankParser():
"""
type: class
explanation: this class is the main class of the program that contains
three attributes of genbank file, two static methods and six instance
methods. the methods are contributing in building a GenbankParser object
as well as an output multi Fasta text file.
output: genbank objects
"""
def __init__(self, definition, origin, feature_objects):
self.definition = definition
self.origin = origin
self.feature_objects = feature_objects
def __str__(self):
return f"{self.definition} and {self.origin} and {self.feature_objects}"
@staticmethod
def store_genbank_file_in_list(input_file_name):
"""
type: static method
input: a genbank file
explanation: this function store genbank file in a list.
output: a list of genbank file sentences.
"""
try:
with open(input_file_name, encoding="utf-8") as genbank:
genbank_list = []
for line in genbank:
genbank_list.append(line)
except OSError:
print(f'file {genbank} cannot open')
#find any unexpected error and re-raise it.
except Exception as err:
print(f'unexpected error: {type(err)}')
raise
return genbank_list
@staticmethod
def store_features_in_list(genbank_file):
"""
type: static method
input: genbank file list from "store_genbank_file_in_list" method.
explanation: this function extract the raw feature part from the
genbank file.
output: 1- a list of feature sentences
2- the number of feature sentences
"""
feature_list = []
for counter, line in enumerate(genbank_file):
feature_finder = re.search("^FEATURES", line)
origin_finder = re.search("^ORIGIN", line)
if feature_finder:
start_point = counter
elif origin_finder:
end_point = counter
file_size = end_point - start_point
feature_list = genbank_file[start_point + 1: end_point]
return feature_list, file_size
def set_definition(self, genbank_file):
"""
type: instance method
input: 1- a list of genbank file sentences.
2- a primitive instance of GenbankParser object
explanation: this function finds the definition of each genbank file in
the form of a string and set it as the defintion attribute
of the class.
output: GenbankParser.definition
"""
final_definition = " "
definition = []
#slice the definition section
for counter, line in enumerate(genbank_file):
definition_finder = re.search("^DEFINITION", line)
accession_finder = re.search("^ACCESSION", line)
if definition_finder:
start_point = counter
elif accession_finder:
end_point = counter
definition_list = genbank_file[start_point: end_point]
#make definition string and set it as object attribute
for line in definition_list:
definition = re.split(r"\s", line, 1)
line = definition[1].lstrip().strip("\n")
final_definition += line
self.definition = final_definition
def set_origin(self, genbank_file, input_file_name):
"""
type: instance method
input: 1- a list of genbank file sentences
2- a primitive instance of GenbankParser object
explanation: this funtion is used to extract origin as a string and set
it as the origin attribute of the class.
output: GenbankParser.origin
"""
origin = []
final_origin = ""
#slice the origin section
for counter, line in enumerate(genbank_file):
origin_finder = re.search("^ORIGIN", line)
if origin_finder:
origin = genbank_file[counter + 1:]
#make origin string and set it as the attribute of the class
for line in origin:
line = line.lstrip().strip("\n")
elements_of_each_line = line.split(" ")
for element in elements_of_each_line:
base_finder = re.search(r"\D", element)
end_correction = re.search("[^//]", element)
if base_finder and end_correction:
final_origin += element
#check the origin string to check all elements are bases
for counter, element in enumerate(final_origin):
if re.search(r"[^atcg]",element) and re.search(".gb$", input_file_name):
raise InvalidOriginArgument(counter, element)
self.origin = final_origin
def set_feature_list(self, genbank_file):
"""
type: instance method
input: 1- a list of genbank file sentences
2- a primitive instance of GenbankParser object
explanation: this method uses 'store_features_in_list' method to cut the
feature part, then by using 'set_feature_attributes' method
from Feature class, it makes Feature objects. Finally, it
accumulates feature objects in a list and set it as feature
_objects attribute of the class.
output: GenbankParser.feature_objects
"""
#make a feature list
feature_file = GenbankParser.store_features_in_list(genbank_file)
file_size = feature_file[1]
feature_list = feature_file[0]
feature_object_list = []
#make feature objects
for counter in range(0, file_size - 1):
feature_object = Features(None, None, None)
feature_object.set_feature_attributes(counter, feature_list)
#make sure all the feature attributes get correct values and add them to a list
if feature_object.name is not None and feature_object.location is not None \
and feature_object.description is not None:
feature_object_list.append(feature_object)
# set the list as the class attribute
self.feature_objects = feature_object_list
def output_writer(self, out_put_name, program_mode):
"""
type: instance method
input: 1- GenbankParser object
2- output name
3- program mod (seperated or uppercased)
explanation: this method organize all the attributes of a genbank object
into multi FASTA format and store them in a text file. this
function use two methods of Feature's child class (FeatureString)
to extraxt the uppercased or seperated string of a feature string.
output: a multi_FASTA text file for a genbank file.
"""
feature_object_list = self.feature_objects
uppercase_name_finder = re.search("^up.*", program_mode)
out_put_name.write(self.definition)
out_put_name.write("\n"*2)
#write the feature object attributes in the output file
for feature_object in feature_object_list:
out_put_name.write(
f">{feature_object.name}{feature_object.description}\n")
#make FeatureString object and set the correct amount for its attribute
feature = FeatureString(feature_object.name, feature_object.description,
feature_object.location, None)
if uppercase_name_finder:
feature.set_feature_string_uppercase(self.origin)
else:
feature.set_feature_string_seperated(self.origin)
#wrap feature string into 60 digit pieces and write it in output file
feature.string = feature.string.lstrip()
feature_string = textwrap.wrap(feature.string, 60)
for element in feature_string:
out_put_name.write(f"{element}\n")
out_put_name.write("\n")
def main():
"""
type: function
input: 1- name of the genbank file
2- mode of the program
explanation: the main function uses GenbankParser methods to build a
GenbankParser object, also it makes an output text file.
output: an output file
"""
#make the output name from input file's name
input_file_name = sys.argv[1]
program_mode = sys.argv[2]
name = re.sub(".gb|.gp", "_result", input_file_name)
#create a GenbankParser object
genbank_object = GenbankParser(None, None, None)
genbank_file = GenbankParser.store_genbank_file_in_list(input_file_name)
genbank_object.set_definition(genbank_file)
genbank_object.set_origin(genbank_file, input_file_name)
genbank_object.set_feature_list(genbank_file)
#make an output text file
with open(f"{name}.txt", "w", encoding="utf-8") as output_name:
genbank_object.output_writer(output_name, program_mode)
if __name__ == "__main__":
main()