Skip to content

Commit ccfc43e

Browse files
new
0 parents  commit ccfc43e

File tree

7 files changed

+5092
-0
lines changed

7 files changed

+5092
-0
lines changed

README.md

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# findTarget is pipeline for searching phi29 homologs by using HMMER program
2+
3+
## Annotation
4+
5+
Пайплайн написан в рамках GenHack 2020 для рашения задачи - Предсказание полимеразной активности новых Φ29 ДНК-полимераз.
6+
7+
## Requirements
8+
9+
PYTHON:
10+
* python >= 3.8
11+
* biopython: `pip3 install biopython`
12+
13+
TOOLS:
14+
* HMMER: conda install -c conda-forge hammer or from source http://hmmer.org
15+
16+
17+
## Installation
18+
19+
```
20+
git clone https://github.com/ubercomrade/pipeline.git
21+
```
22+
23+
## Usage
24+
The command `findTarget -h` return:
25+
26+
```
27+
usage: findTarget.py [-h] [-t TMP_DIR] genebank fasta results tag
28+
29+
positional arguments:
30+
genebank path to GeneBank file
31+
fasta path to write Fasta (parsed CDS from GeneBank file)
32+
results dir to write results
33+
tag tag for output files
34+
35+
optional arguments:
36+
-h, --help show this help message and exit
37+
-t TMP_DIR, --tmp TMP_DIR
38+
tmp dir
39+
```
40+
### Example run
41+
```
42+
findTarget.py \
43+
path/to/gbk/12_45.gbk \
44+
path/to/write/parsed/CDS/12_45.fasta \
45+
dir/to/write/res/ \
46+
tag-of-files
47+
```
48+
49+
## License
50+
51+
Copyright (c) 2020 Trituration
52+
53+
Permission is hereby granted, free of charge, to any person obtaining a copy
54+
of this software and associated documentation files (the "Software"), to deal
55+
in the Software without restriction, including without limitation the rights
56+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
57+
copies of the Software, and to permit persons to whom the Software is
58+
furnished to do so, subject to the following conditions:
59+
60+
The above copyright notice and this permission notice shall be included in all
61+
copies or substantial portions of the Software.
62+
63+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
64+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
65+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
66+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
67+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
68+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
69+
SOFTWARE.

findTarget.py

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
from Bio import SeqIO
2+
import os
3+
import os.path
4+
import sys
5+
import shutil
6+
import pathlib
7+
import subprocess
8+
import argparse
9+
10+
11+
def split_gbk(path, tmp_dir):
12+
with open(path) as file:
13+
container = []
14+
index = 1
15+
for line in file:
16+
if line.startswith('//'):
17+
container.append(line)
18+
with open(tmp_dir + '/tmp.{}.gb'.format(index), 'w') as tmp:
19+
for line in container:
20+
tmp.write(line)
21+
tmp.close()
22+
container = []
23+
index += 1
24+
else:
25+
container.append(line)
26+
return(0)
27+
28+
29+
def write_fasta(data, path):
30+
with open(path, 'a') as file:
31+
for record in data:
32+
file.write('>{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n'.format(record['locus'],
33+
record['start'],
34+
record['end'],
35+
record['strand'],
36+
record['product'],
37+
record['protein_id'],
38+
record['locus_tag']))
39+
file.write(record['seq'] + '\n\n')
40+
return(0)
41+
42+
43+
def parse_gbk(tmp_dir, file):
44+
container = []
45+
data = SeqIO.read("{0}/{1}".format(tmp_dir, file), "genbank")
46+
cds = [f for f in data.features if f.type == "CDS"]
47+
for i in cds:
48+
try:
49+
record = {}
50+
record['seq'] = i.qualifiers['translation'][0]
51+
record['locus'] = data.id
52+
record['locus_tag'] = i.qualifiers['locus_tag'][0]
53+
record['product'] = i.qualifiers['product'][0]
54+
record['protein_id']= i.qualifiers['protein_id'][0]
55+
record['start'] = int(i.location.start)
56+
record['end'] = int(i.location.end)
57+
record['strand'] = i.location.strand
58+
container.append(record)
59+
except:
60+
print('problem with {}'.format(i.qualifiers['locus_tag'][0]) )
61+
return(container)
62+
63+
64+
65+
def parse_gbk(tmp_dir, file):
66+
container = []
67+
data = SeqIO.read("{0}/{1}".format(tmp_dir, file), "genbank")
68+
cds = [f for f in data.features if f.type == "CDS"]
69+
for i in cds:
70+
record = {}
71+
record['seq'] = i.qualifiers['translation'][0]
72+
record['locus'] = data.id
73+
record['locus_tag'] = i.qualifiers['locus_tag'][0]
74+
record['product'] = i.qualifiers['product'][0]
75+
record['protein_id']= i.qualifiers['protein_id'][0]
76+
record['start'] = int(i.location.start)
77+
record['end'] = int(i.location.end)
78+
record['strand'] = i.location.strand
79+
container.append(record)
80+
return(container)
81+
82+
83+
84+
def run_hhmer(model_path, fasta_path, results_path):
85+
args = ["hmmsearch", '{}'.format(model_path), '{}'.format(fasta_path)]
86+
r = subprocess.run(args, capture_output=True)
87+
args = ["hmmsearch", '{}'.format(model_path), '{}'.format(fasta_path)]
88+
r = subprocess.run(args, capture_output=True)
89+
with open(results_path, 'wb') as file:
90+
file.write(r.stdout)
91+
file.close()
92+
return(r.stdout)
93+
94+
95+
def gbk_to_fasta(gbk_path, fasta_path, tmp_dir):
96+
if not os.path.exists(tmp_dir):
97+
os.mkdir(tmp_dir)
98+
split_gbk(gbk_path, tmp_dir)
99+
files = [i for i in os.listdir(tmp_dir) if not i.startswith('.')]
100+
for file in files:
101+
data = parse_gbk(tmp_dir, file)
102+
write_fasta(data, fasta_path)
103+
shutil.rmtree(tmp_dir)
104+
return(0)
105+
106+
107+
# def main():
108+
# wd = "/Users/tsukanov/Downloads/"
109+
# gbs = [i for i in os.listdir(wd) if ".gb" in i]
110+
# models = ['phi29-1-191', 'phi29-192-229', 'phi29-398-420', 'phag-polymerase-b']
111+
# models_dir = '/Users/tsukanov/Documents/Хакатон/'
112+
# tmp_dir = wd + "/tmp/"
113+
# for g in gbs:
114+
# print(g)
115+
# gbk_path = wd + g
116+
# name = g.split('.')[0]
117+
# fasta_path = wd + "/{0}.fasta".format(name)
118+
# if os.path.isfile(fasta_path):
119+
# os.remove(fasta_path)
120+
# gbk_to_fasta(gbk_path, fasta_path, tmp_dir)
121+
# for model in models:
122+
# model_path = models_dir + "/{0}.hmm".format(model)
123+
# results_path = wd + "/hmmer_{0}_res_{1}.txt".format(name, model)
124+
# run_hhmer(model_path, fasta_path, results_path)
125+
# return(0)
126+
127+
128+
129+
def parse_args():
130+
parser = argparse.ArgumentParser()
131+
parser.add_argument('genebank', action='store', help='path to GeneBank file')
132+
parser.add_argument('fasta', action='store', help='path to write Fasta (parsed CDS from GeneBank file)')
133+
parser.add_argument('results', action='store', help='dir to write results')
134+
parser.add_argument('tag', action='store', help='tag for output files')
135+
parser.add_argument('-t', '--tmp', action='store', dest='tmp_dir',
136+
required=False, default='./tmp', help='tmp dir')
137+
if len(sys.argv) == 1:
138+
parser.print_help(sys.stderr)
139+
sys.exit(1)
140+
return(parser.parse_args())
141+
142+
143+
def main():
144+
args = parse_args()
145+
gbk_path = args.genebank
146+
fasta_path = args.fasta
147+
results_dir = args.results
148+
tag = args.tag
149+
tmp_dir = args.tmp_dir
150+
this_dir, this_filename = os.path.split(__file__)
151+
models_dir = os.path.join(this_dir, 'models')
152+
models = ['phi29-1-191', 'phi29-192-229', 'phi29-398-420', 'phage-polymerase-b']
153+
if os.path.isfile(fasta_path):
154+
os.remove(fasta_path)
155+
gbk_to_fasta(gbk_path, fasta_path, tmp_dir)
156+
for model in models:
157+
model_path = models_dir + "/{0}.hmm".format(model)
158+
results_path = results_dir + "/hmmer_{0}_res_{1}.txt".format(tag, model)
159+
run_hhmer(model_path, fasta_path, results_path)
160+
return(0)
161+
162+
163+
if __name__=="__main__":
164+
main()

0 commit comments

Comments
 (0)