1
+ from Bio import SeqIO
2
+ import os
3
+ import os .path
4
+ import sys
5
+ import shutil
6
+ import pathlib
7
+ import subprocess
8
+ import argparse
9
+
10
+
11
+ def split_gbk (path , tmp_dir ):
12
+ with open (path ) as file :
13
+ container = []
14
+ index = 1
15
+ for line in file :
16
+ if line .startswith ('//' ):
17
+ container .append (line )
18
+ with open (tmp_dir + '/tmp.{}.gb' .format (index ), 'w' ) as tmp :
19
+ for line in container :
20
+ tmp .write (line )
21
+ tmp .close ()
22
+ container = []
23
+ index += 1
24
+ else :
25
+ container .append (line )
26
+ return (0 )
27
+
28
+
29
+ def write_fasta (data , path ):
30
+ with open (path , 'a' ) as file :
31
+ for record in data :
32
+ file .write ('>{0}\t {1}\t {2}\t {3}\t {4}\t {5}\t {6}\n ' .format (record ['locus' ],
33
+ record ['start' ],
34
+ record ['end' ],
35
+ record ['strand' ],
36
+ record ['product' ],
37
+ record ['protein_id' ],
38
+ record ['locus_tag' ]))
39
+ file .write (record ['seq' ] + '\n \n ' )
40
+ return (0 )
41
+
42
+
43
+ def parse_gbk (tmp_dir , file ):
44
+ container = []
45
+ data = SeqIO .read ("{0}/{1}" .format (tmp_dir , file ), "genbank" )
46
+ cds = [f for f in data .features if f .type == "CDS" ]
47
+ for i in cds :
48
+ try :
49
+ record = {}
50
+ record ['seq' ] = i .qualifiers ['translation' ][0 ]
51
+ record ['locus' ] = data .id
52
+ record ['locus_tag' ] = i .qualifiers ['locus_tag' ][0 ]
53
+ record ['product' ] = i .qualifiers ['product' ][0 ]
54
+ record ['protein_id' ]= i .qualifiers ['protein_id' ][0 ]
55
+ record ['start' ] = int (i .location .start )
56
+ record ['end' ] = int (i .location .end )
57
+ record ['strand' ] = i .location .strand
58
+ container .append (record )
59
+ except :
60
+ print ('problem with {}' .format (i .qualifiers ['locus_tag' ][0 ]) )
61
+ return (container )
62
+
63
+
64
+
65
+ def parse_gbk (tmp_dir , file ):
66
+ container = []
67
+ data = SeqIO .read ("{0}/{1}" .format (tmp_dir , file ), "genbank" )
68
+ cds = [f for f in data .features if f .type == "CDS" ]
69
+ for i in cds :
70
+ record = {}
71
+ record ['seq' ] = i .qualifiers ['translation' ][0 ]
72
+ record ['locus' ] = data .id
73
+ record ['locus_tag' ] = i .qualifiers ['locus_tag' ][0 ]
74
+ record ['product' ] = i .qualifiers ['product' ][0 ]
75
+ record ['protein_id' ]= i .qualifiers ['protein_id' ][0 ]
76
+ record ['start' ] = int (i .location .start )
77
+ record ['end' ] = int (i .location .end )
78
+ record ['strand' ] = i .location .strand
79
+ container .append (record )
80
+ return (container )
81
+
82
+
83
+
84
+ def run_hhmer (model_path , fasta_path , results_path ):
85
+ args = ["hmmsearch" , '{}' .format (model_path ), '{}' .format (fasta_path )]
86
+ r = subprocess .run (args , capture_output = True )
87
+ args = ["hmmsearch" , '{}' .format (model_path ), '{}' .format (fasta_path )]
88
+ r = subprocess .run (args , capture_output = True )
89
+ with open (results_path , 'wb' ) as file :
90
+ file .write (r .stdout )
91
+ file .close ()
92
+ return (r .stdout )
93
+
94
+
95
+ def gbk_to_fasta (gbk_path , fasta_path , tmp_dir ):
96
+ if not os .path .exists (tmp_dir ):
97
+ os .mkdir (tmp_dir )
98
+ split_gbk (gbk_path , tmp_dir )
99
+ files = [i for i in os .listdir (tmp_dir ) if not i .startswith ('.' )]
100
+ for file in files :
101
+ data = parse_gbk (tmp_dir , file )
102
+ write_fasta (data , fasta_path )
103
+ shutil .rmtree (tmp_dir )
104
+ return (0 )
105
+
106
+
107
+ # def main():
108
+ # wd = "/Users/tsukanov/Downloads/"
109
+ # gbs = [i for i in os.listdir(wd) if ".gb" in i]
110
+ # models = ['phi29-1-191', 'phi29-192-229', 'phi29-398-420', 'phag-polymerase-b']
111
+ # models_dir = '/Users/tsukanov/Documents/Хакатон/'
112
+ # tmp_dir = wd + "/tmp/"
113
+ # for g in gbs:
114
+ # print(g)
115
+ # gbk_path = wd + g
116
+ # name = g.split('.')[0]
117
+ # fasta_path = wd + "/{0}.fasta".format(name)
118
+ # if os.path.isfile(fasta_path):
119
+ # os.remove(fasta_path)
120
+ # gbk_to_fasta(gbk_path, fasta_path, tmp_dir)
121
+ # for model in models:
122
+ # model_path = models_dir + "/{0}.hmm".format(model)
123
+ # results_path = wd + "/hmmer_{0}_res_{1}.txt".format(name, model)
124
+ # run_hhmer(model_path, fasta_path, results_path)
125
+ # return(0)
126
+
127
+
128
+
129
+ def parse_args ():
130
+ parser = argparse .ArgumentParser ()
131
+ parser .add_argument ('genebank' , action = 'store' , help = 'path to GeneBank file' )
132
+ parser .add_argument ('fasta' , action = 'store' , help = 'path to write Fasta (parsed CDS from GeneBank file)' )
133
+ parser .add_argument ('results' , action = 'store' , help = 'dir to write results' )
134
+ parser .add_argument ('tag' , action = 'store' , help = 'tag for output files' )
135
+ parser .add_argument ('-t' , '--tmp' , action = 'store' , dest = 'tmp_dir' ,
136
+ required = False , default = './tmp' , help = 'tmp dir' )
137
+ if len (sys .argv ) == 1 :
138
+ parser .print_help (sys .stderr )
139
+ sys .exit (1 )
140
+ return (parser .parse_args ())
141
+
142
+
143
+ def main ():
144
+ args = parse_args ()
145
+ gbk_path = args .genebank
146
+ fasta_path = args .fasta
147
+ results_dir = args .results
148
+ tag = args .tag
149
+ tmp_dir = args .tmp_dir
150
+ this_dir , this_filename = os .path .split (__file__ )
151
+ models_dir = os .path .join (this_dir , 'models' )
152
+ models = ['phi29-1-191' , 'phi29-192-229' , 'phi29-398-420' , 'phage-polymerase-b' ]
153
+ if os .path .isfile (fasta_path ):
154
+ os .remove (fasta_path )
155
+ gbk_to_fasta (gbk_path , fasta_path , tmp_dir )
156
+ for model in models :
157
+ model_path = models_dir + "/{0}.hmm" .format (model )
158
+ results_path = results_dir + "/hmmer_{0}_res_{1}.txt" .format (tag , model )
159
+ run_hhmer (model_path , fasta_path , results_path )
160
+ return (0 )
161
+
162
+
163
+ if __name__ == "__main__" :
164
+ main ()
0 commit comments