ubercomrade
diff --git a/‎README.md
Lines changed: 69 additions & 0 deletions b/‎README.md
Lines changed: 69 additions & 0 deletions
diff --git a/‎findTarget.py
Lines changed: 164 additions & 0 deletions b/‎findTarget.py
Lines changed: 164 additions & 0 deletions
@@ -0,0 +1,69 @@
+# findTarget is a pipeline for searching for phi29 homologs using the HMMER program
+
+## Annotation
+
+Пайплайн написан в рамках GenHack 2020 для решения задачи «Предсказание полимеразной активности новых Φ29 ДНК-полимераз».
+
+## Requirements
+
+PYTHON:
+  * python >= 3.8
+  * biopython: `pip3 install biopython`
+
+TOOLS:
+  * HMMER: `conda install -c bioconda hmmer` or build from source (http://hmmer.org)
+
+
+## Installation
+
+```  
+git clone https://github.com/ubercomrade/pipeline.git  
+```
+
+## Usage
+The command `findTarget.py -h` returns:
+
+```
+usage: findTarget.py [-h] [-t TMP_DIR] genebank fasta results tag
+
+positional arguments:
+  genebank              path to GeneBank file
+  fasta                 path to write Fasta (parsed CDS from GeneBank file)
+  results               dir to write results
+  tag                   tag for output files
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -t TMP_DIR, --tmp TMP_DIR
+                        tmp dir
+```
+### Example run
+```
+findTarget.py \
+path/to/gbk/12_45.gbk \
+path/to/write/parsed/CDS/12_45.fasta \
+dir/to/write/res/ \
+tag-of-files
+```
+
+## License
+
+Copyright (c) 2020 Trituration
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -0,0 +1,164 @@
+from Bio import SeqIO
+import os
+import os.path
+import sys
+import shutil
+import pathlib
+import subprocess
+import argparse
+
+
def split_gbk(path, tmp_dir):
    """Split a multi-record GenBank file into one file per record.

    A record ends at a line that starts with ``//``. Record number ``k``
    (counting from 1) is written, terminator line included, to
    ``<tmp_dir>/tmp.<k>.gb``. Lines that follow the last terminator are
    never written, so trailing blank lines do not produce an empty file.

    Args:
        path: Path to the GenBank file to split.
        tmp_dir: Existing directory that receives the per-record files.

    Returns:
        Always 0.
    """
    record_lines = []
    index = 1
    with open(path) as source:
        for line in source:
            record_lines.append(line)
            if not line.startswith('//'):
                continue
            # Terminator reached: flush the buffered record to its own file.
            chunk_path = os.path.join(tmp_dir, 'tmp.{}.gb'.format(index))
            with open(chunk_path, 'w') as chunk:
                chunk.writelines(record_lines)
            record_lines = []
            index += 1
    return 0
+
+
def write_fasta(data, path):
    """Append CDS records to a FASTA file.

    Each record becomes a header line holding the tab-separated fields
    locus, start, end, strand, product, protein_id and locus_tag, followed
    by the sequence and one blank line. The file is opened in append mode,
    so existing content is kept.

    Args:
        data: Iterable of dicts with the keys listed above plus ``seq``.
        path: Path of the FASTA file to append to.

    Returns:
        Always 0.
    """
    header_keys = ('locus', 'start', 'end', 'strand',
                   'product', 'protein_id', 'locus_tag')
    header_template = '>{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n'
    with open(path, 'a') as fasta:
        for entry in data:
            header_fields = [entry[key] for key in header_keys]
            fasta.write(header_template.format(*header_fields))
            fasta.write(entry['seq'] + '\n\n')
    return 0
+
+
def parse_gbk(tmp_dir, file):
    """Collect the annotated CDS features of a single-record GenBank file.

    The file is read with ``SeqIO.read`` in ``"genbank"`` format. A CDS is
    kept only when it carries non-empty ``translation``, ``locus_tag``,
    ``product`` and ``protein_id`` qualifiers. Any other CDS (for example
    one without a translation) is reported on stderr and skipped, so one
    incomplete annotation does not abort the whole run.

    Args:
        tmp_dir: Directory that contains the GenBank file.
        file: Name of the GenBank file inside ``tmp_dir``.

    Returns:
        A list of dicts, one per kept CDS, with the keys ``seq``, ``locus``,
        ``locus_tag``, ``product``, ``protein_id``, ``start``, ``end`` and
        ``strand``.
    """
    required = ('translation', 'locus_tag', 'product', 'protein_id')
    data = SeqIO.read(os.path.join(tmp_dir, file), "genbank")
    container = []
    for feature in data.features:
        if feature.type != "CDS":
            continue
        qualifiers = feature.qualifiers
        missing = [key for key in required if not qualifiers.get(key)]
        if missing:
            # Pick whatever identifier is available; locus_tag itself may be
            # the qualifier that is missing.
            label = (qualifiers.get('locus_tag')
                     or qualifiers.get('protein_id')
                     or ['<unnamed CDS>'])[0]
            print('problem with {}: missing {}'.format(label, ', '.join(missing)),
                  file=sys.stderr)
            continue
        container.append({
            'seq': qualifiers['translation'][0],
            'locus': data.id,
            'locus_tag': qualifiers['locus_tag'][0],
            'product': qualifiers['product'][0],
            'protein_id': qualifiers['protein_id'][0],
            'start': int(feature.location.start),
            'end': int(feature.location.end),
            'strand': feature.location.strand,
        })
    return container
+
+
+
def run_hhmer(model_path, fasta_path, results_path):
    """Run ``hmmsearch`` for one HMM profile and save its report.

    ``hmmsearch <model_path> <fasta_path>`` is executed once with its output
    captured. The captured stdout is written unchanged to ``results_path``.
    If the command exits with a non-zero status, the status and the captured
    stderr are echoed to this process's stderr so the failure is visible.
    The (possibly empty) stdout is still written and returned.

    Args:
        model_path: Path to the ``.hmm`` profile.
        fasta_path: Path to the protein FASTA file to search.
        results_path: Path of the report file to (over)write.

    Returns:
        The raw stdout of ``hmmsearch`` as bytes.
    """
    command = ["hmmsearch", '{}'.format(model_path), '{}'.format(fasta_path)]
    completed = subprocess.run(command, capture_output=True)
    if completed.returncode != 0:
        sys.stderr.write('hmmsearch exited with status {0} for model {1}\n'.format(
            completed.returncode, model_path))
        sys.stderr.write(completed.stderr.decode(errors='replace'))
    with open(results_path, 'wb') as report:
        report.write(completed.stdout)
    return completed.stdout
+
+
def gbk_to_fasta(gbk_path, fasta_path, tmp_dir):
    """Convert every CDS of a (multi-record) GenBank file to FASTA.

    The GenBank file is split into per-record files inside a private
    scratch directory created under ``tmp_dir``. Each piece is parsed in
    record order and its CDS entries are appended to ``fasta_path``.

    The scratch directory is always removed, even when parsing fails.
    ``tmp_dir`` itself is removed only if this call created it, so a
    directory that already existed (and whatever it holds) is left alone.

    Args:
        gbk_path: Path to the input GenBank file.
        fasta_path: Path of the FASTA file to append to.
        tmp_dir: Directory under which temporary files are placed; created
            (including parents) if it does not exist.

    Returns:
        Always 0.
    """
    import tempfile

    created_tmp_dir = not os.path.exists(tmp_dir)
    os.makedirs(tmp_dir, exist_ok=True)
    # Work in a fresh sub-directory so unrelated or stale files that may
    # already sit in tmp_dir are neither parsed nor deleted.
    work_dir = tempfile.mkdtemp(prefix='gbk_split_', dir=tmp_dir)
    try:
        split_gbk(gbk_path, work_dir)
        pieces = [name for name in os.listdir(work_dir)
                  if name.startswith('tmp.') and name.endswith('.gb')]
        # os.listdir order is arbitrary; sort by the numeric record index so
        # the FASTA keeps the record order of the GenBank file.
        pieces.sort(key=lambda name: int(name.split('.')[1]))
        for piece in pieces:
            write_fasta(parse_gbk(work_dir, piece), fasta_path)
    finally:
        shutil.rmtree(work_dir, ignore_errors=True)
        if created_tmp_dir:
            shutil.rmtree(tmp_dir, ignore_errors=True)
    return 0
+
+    
+# def main():
+#     wd = "/Users/tsukanov/Downloads/"
+#     gbs = [i for i in os.listdir(wd) if ".gb" in i]
+#     models = ['phi29-1-191', 'phi29-192-229', 'phi29-398-420', 'phag-polymerase-b']
+#     models_dir = '/Users/tsukanov/Documents/Хакатон/'
+#     tmp_dir = wd + "/tmp/"
+#     for g in gbs:
+#         print(g)
+#         gbk_path = wd + g
+#         name = g.split('.')[0]
+#         fasta_path = wd + "/{0}.fasta".format(name)
+#         if os.path.isfile(fasta_path):
+#             os.remove(fasta_path)
+#         gbk_to_fasta(gbk_path, fasta_path, tmp_dir)
+#         for model in models:
+#             model_path = models_dir + "/{0}.hmm".format(model)
+#             results_path = wd + "/hmmer_{0}_res_{1}.txt".format(name, model)
+#             run_hhmer(model_path, fasta_path, results_path)
+#     return(0)
+
+
+
def parse_args():
    """Parse the command line of the pipeline.

    Four positional arguments are required: ``genebank``, ``fasta``,
    ``results`` and ``tag``. The optional ``-t/--tmp`` is stored as
    ``tmp_dir`` and defaults to ``./tmp``. When the script is started
    without any argument, the help text is printed to stderr and the
    process exits with status 1.

    Returns:
        The ``argparse.Namespace`` with the parsed values.
    """
    cli = argparse.ArgumentParser()
    positionals = (
        ('genebank', 'path to GeneBank file'),
        ('fasta', 'path to write Fasta (parsed CDS from GeneBank file)'),
        ('results', 'dir to write results'),
        ('tag', 'tag for output files'),
    )
    for arg_name, description in positionals:
        cli.add_argument(arg_name, action='store', help=description)
    cli.add_argument('-t', '--tmp', dest='tmp_dir', action='store',
                     default='./tmp', required=False, help='tmp dir')

    called_without_arguments = len(sys.argv) == 1
    if called_without_arguments:
        cli.print_help(sys.stderr)
        sys.exit(1)
    return cli.parse_args()
+
+
def main():
    """Run the pipeline: GenBank -> CDS FASTA -> one hmmsearch report per model.

    The HMM profiles are looked up in the ``models`` directory next to this
    script. An existing FASTA file at the target path is removed first,
    because the FASTA writer appends. The results directory is created if
    it does not exist yet, so the reports can be written after the parsing
    work is done. Reports are named ``hmmer_<tag>_res_<model>.txt``.

    Returns:
        Always 0.
    """
    args = parse_args()
    gbk_path = args.genebank
    fasta_path = args.fasta
    results_dir = args.results
    tag = args.tag
    tmp_dir = args.tmp_dir

    this_dir = os.path.dirname(os.path.abspath(__file__))
    models_dir = os.path.join(this_dir, 'models')
    models = ['phi29-1-191', 'phi29-192-229', 'phi29-398-420', 'phage-polymerase-b']

    if os.path.isfile(fasta_path):
        os.remove(fasta_path)
    # An empty string means "current directory"; makedirs('') would fail.
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)

    gbk_to_fasta(gbk_path, fasta_path, tmp_dir)
    for model in models:
        model_path = os.path.join(models_dir, "{0}.hmm".format(model))
        results_path = os.path.join(
            results_dir, "hmmer_{0}_res_{1}.txt".format(tag, model))
        run_hhmer(model_path, fasta_path, results_path)
    return 0
+
+
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()