8000 added ICEberg database blastp results · fmalmeida/pythonScripts@bc006d6 · GitHub
[go: up one dir, main page]

Skip to content

Commit bc006d6

Browse files
committed
added ICEberg database blastp results
1 parent bc0231a commit bc006d6

File tree

1 file changed

+57
-1
lines changed

1 file changed

+57
-1
lines changed

falmeida_py/mges_function.py

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
##################################
44
import pandas as pd
55
import os
6+
from .utils import load_and_subset_gff
67

78
####################################
89
### check MGEs annotations stats ###
@@ -14,6 +15,9 @@ def mges_stats(bacannot_summary):
1415

1516
# load dir of samples' results
1617
results_dir = bacannot_summary[sample]['results_dir']
18+
19+
# load gff_file
20+
gff_file = f"{results_dir}/gffs/{sample}.gff"
1721

1822
# integron_finder
1923
if os.path.exists(f"{results_dir}/integron_finder/{sample}_integrons.gff") and os.stat(f"{results_dir}/integron_finder/{sample}_integrons.gff").st_size > 0:
@@ -54,4 +58,56 @@ def mges_stats(bacannot_summary):
5458
bacannot_summary[sample]['MGE']['integron_finder'][seq][id]['end'] = row['end']
5559
bacannot_summary[sample]['MGE']['integron_finder'][seq][id]['type'] = row['atts'].split(';')[1].split('=')[-1]
5660
bacannot_summary[sample]['MGE']['integron_finder'][seq][id]['source'] = row['source']
57-
bacannot_summary[sample]['MGE']['integron_finder'][seq][id]['product'] = row['type']
61+
bacannot_summary[sample]['MGE']['integron_finder'][seq][id]['product'] = row['type']
62+
63+
# ICE database
64+
ice_db_blastp = f"{results_dir}/ICEs/{sample}_iceberg_blastp_onGenes.summary.txt"
65+
if os.path.exists(ice_db_blastp) and os.stat(ice_db_blastp).st_size > 0:
66+
67+
# init MGE annotation dictionary
68+
if 'MGE' not in bacannot_summary[sample]:
69+
bacannot_summary[sample]['MGE'] = {}
70+
71+
# init iceberg annotation dictionary
72+
if 'ICE' not in bacannot_summary[sample]['MGE']:
73+
bacannot_summary[sample]['MGE']['ICEberg'] = {}
74+
75+
# init iceberg blastp annotation dictionary
76+
bacannot_summary[sample]['MGE']['ICEberg']['blastp'] = {}
77+
78+
# load integron_finder results
79+
results = pd.read_csv(
80+
ice_db_blastp,
81+
sep='\t'
82+
)
83+
84+
# load gff
85+
gff = load_and_subset_gff(gff_file, 'source', 'ICE')
86+
87+
# number of integron_finder annotations
88+
total_number = len(results.index)
89+
bacannot_summary[sample]['MGE']['ICEberg']['blastp']['total'] = total_number
90+
91+
# per gene info
92+
if int(results.shape[0]) > 0:
93+
for seq in [ str(x) for x in results['SEQUENCE'].unique() ]:
94+
95+
# details missing in output but available in gff
96+
gff_row = gff[gff['attributes'].str.contains(seq)]
97+
contig = gff_row['seq'].item()
98+
start = gff_row['start'].item()
99+
end = gff_row['end'].item()
100+
101+
bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq] = {}
102+
for index, row in results[results['SEQUENCE'] == seq].reset_index().iterrows():
103+
bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq][id] = {}
104+
bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq][id]['id'] = row['ICEBERG_ID']
105+
bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq][id]['contig'] = contig
106+
bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq][id]['start'] = start
107+
bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq][id]['end'] = end
108+
bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq][id]['accession'] = row['ACCESSION']
109+
bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq][id]['product'] = row['PRODUCT']
110+
bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq][id]['description'] = row['DESCRIPTION']
111+
bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq][id]['blast_start'] = row['START']
112+
bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq][id]['blast_end'] = row['END']
113+
bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq][id]['strand'] = row['STRAND']

0 commit comments

Comments
 (0)
0