3
3
##################################
4
4
import pandas as pd
5
5
import os
6
+ from .utils import load_and_subset_gff
6
7
7
8
####################################
8
9
### check MGEs annotations stats ###
@@ -14,6 +15,9 @@ def mges_stats(bacannot_summary):
14
15
15
16
# load dir of samples' results
16
17
results_dir = bacannot_summary [sample ]['results_dir' ]
18
+
19
+ # load gff_file
20
+ gff_file = f"{ results_dir } /gffs/{ sample } .gff"
17
21
18
22
# integron_finder
19
23
if os .path .exists (f"{ results_dir } /integron_finder/{ sample } _integrons.gff" ) and os .stat (f"{ results_dir } /integron_finder/{ sample } _integrons.gff" ).st_size > 0 :
@@ -54,4 +58,56 @@ def mges_stats(bacannot_summary):
54
58
bacannot_summary [sample ]['MGE' ]['integron_finder' ][seq ][id ]['end' ] = row ['end' ]
55
59
bacannot_summary [sample ]['MGE' ]['integron_finder' ][seq ][id ]['type' ] = row ['atts' ].split (';' )[1 ].split ('=' )[- 1 ]
56
60
bacannot_summary [sample ]['MGE' ]['integron_finder' ][seq ][id ]['source' ] = row ['source' ]
57
- bacannot_summary [sample ]['MGE' ]['integron_finder' ][seq ][id ]['product' ] = row ['type' ]
61
+ bacannot_summary [sample ]['MGE' ]['integron_finder' ][seq ][id ]['product' ] = row ['type' ]
62
+
63
+ # ICE database
64
+ ice_db_blastp = f"{ results_dir } /ICEs/{ sample } _iceberg_blastp_onGenes.summary.txt"
65
+ if os .path .exists (ice_db_blastp ) and os .stat (ice_db_blastp ).st_size > 0 :
66
+
67
+ # init MGE annotation dictionary
68
+ if 'MGE' not in bacannot_summary [sample ]:
69
+ bacannot_summary [sample ]['MGE' ] = {}
70
+
71
+ # init iceberg annotation dictionary
72
+ if 'ICE' not in bacannot_summary [sample ]['MGE' ]:
73
+ bacannot_summary [sample ]['MGE' ]['ICEberg' ] = {}
74
+
75
+ # init iceberg blastp annotation dictionary
76
+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ] = {}
77
+
78
+ # load integron_finder results
79
+ results = pd .read_csv (
80
+ ice_db_blastp ,
81
+ sep = '\t '
82
+ )
83
+
84
+ # load gff
85
+ gff = load_and_subset_gff (gff_file , 'source' , 'ICE' )
86
+
87
+ # number of integron_finder annotations
88
+ total_number = len (results .index )
89
+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ]['total' ] = total_number
90
+
91
+ # per gene info
92
+ if int (results .shape [0 ]) > 0 :
93
+ for seq in [ str (x ) for x in results ['SEQUENCE' ].unique () ]:
94
+
95
+ # details missing in output but available in gff
96
+ gff_row = gff [gff ['attributes' ].str .contains (seq )]
97
+ contig = gff_row ['seq' ].item ()
98
+ start = gff_row ['start' ].item ()
99
+ end = gff_row ['end' ].item ()
100
+
101
+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ][seq ] = {}
102
+ for index , row in results [results ['SEQUENCE' ] == seq ].reset_index ().iterrows ():
103
+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ][seq ][id ] = {}
104
+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ][seq ][id ]['id' ] = row ['ICEBERG_ID' ]
105
+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ][seq ][id ]['contig' ] = contig
106
+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ][seq ][id ]['start' ] = start
107
+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ][seq][id ]['end' ] = end
108
+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ][seq ][id ]['accession' ] = row ['ACCESSION' ]
109
+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ][seq ][id ]['product' ] = row ['PRODUCT' ]
110
+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ][seq ][id ]['description' ] = row ['DESCRIPTION' ]
111
+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ][seq ][id ]['blast_start' ] = row ['START' ]
112
+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ][seq ][id ]['blast_end' ] = row ['END' ]
113
+ bacannot_summary [sample ]['MGE' ]['ICEberg' ]['blastp' ][seq ][id ]['strand' ] = row ['STRAND' ]
0 commit comments