fmalmeida
diff --git a/falmeida_py/mges_function.py b/falmeida_py/mges_function.py
Lines changed: 57 additions & 1 deletion
@@ -3,6 +3,7 @@
 ##################################
 import pandas as pd
 import os
+from .utils import load_and_subset_gff
 
 ####################################
 ### check MGEs annotations stats ###
@@ -14,6 +15,9 @@ def mges_stats(bacannot_summary):
 
         # load dir of samples' results
         results_dir = bacannot_summary[sample]['results_dir']
+
+        # path to the sample's GFF file (loaded later, only when needed)
+        gff_file = f"{results_dir}/gffs/{sample}.gff"
 
         # integron_finder
         if os.path.exists(f"{results_dir}/integron_finder/{sample}_integrons.gff") and os.stat(f"{results_dir}/integron_finder/{sample}_integrons.gff").st_size > 0:
@@ -54,4 +58,56 @@ def mges_stats(bacannot_summary):
                         bacannot_summary[sample]['MGE']['integron_finder'][seq][id]['end'] = row['end']
                         bacannot_summary[sample]['MGE']['integron_finder'][seq][id]['type'] = row['atts'].split(';')[1].split('=')[-1]
                         bacannot_summary[sample]['MGE']['integron_finder'][seq][id]['source'] = row['source']
-                        bacannot_summary[sample]['MGE']['integron_finder'][seq][id]['product'] = row['type']
+                        bacannot_summary[sample]['MGE']['integron_finder'][seq][id]['product'] = row['type']
+
+        # ICE database
+        ice_db_blastp = f"{results_dir}/ICEs/{sample}_iceberg_blastp_onGenes.summary.txt"
+        if os.path.exists(ice_db_blastp) and os.stat(ice_db_blastp).st_size > 0:
+
+            # init MGE annotation dictionary
+            if 'MGE' not in bacannot_summary[sample]:
+                bacannot_summary[sample]['MGE'] = {}
+            
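+            # init ICEberg annotation dictionary (NOTE(review): the guard below tests key 'ICE' but the key created is 'ICEberg' -- confirm which is intended)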
+            if 'ICE' not in bacannot_summary[sample]['MGE']:
+                bacannot_summary[sample]['MGE']['ICEberg'] = {}
+
+            # init iceberg blastp annotation dictionary
+            bacannot_summary[sample]['MGE']['ICEberg']['blastp'] = {}
+
+            # load ICEberg blastp summary results
+            results = pd.read_csv(
+                ice_db_blastp,
+                sep='\t'
+            )
+
+            # load gff
+            gff = load_and_subset_gff(gff_file, 'source', 'ICE')
+
+            # number of ICEberg blastp annotations
+            total_number = len(results.index)
+            bacannot_summary[sample]['MGE']['ICEberg']['blastp']['total'] = total_number
+
+            # per gene info
+            if int(results.shape[0]) > 0:
+                for seq in [ str(x) for x in results['SEQUENCE'].unique() ]:
+
+                    # details missing in output but available in gff
+                    gff_row = gff[gff['attributes'].str.contains(seq)]
+                    contig = gff_row['seq'].item()
+                    start  = gff_row['start'].item()
+                    end    = gff_row['end'].item()
+                    
+                    bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq] = {}
+                    for index, row in results[results['SEQUENCE'] == seq].reset_index().iterrows():
+                        bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq][id] = {}
+                        bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq][id]['id']          = row['ICEBERG_ID']
+                        bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq][id]['contig']      = contig
+                        bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq][id]['start']       = start
+                        bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq][id]['end']         = end
+                        bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq][id]['accession']   = row['ACCESSION']
+                        bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq][id]['product']     = row['PRODUCT']
+                        bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq][id]['description'] = row['DESCRIPTION']
+                        bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq][id]['blast_start'] = row['START']
+                        bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq][id]['blast_end']   = row['END']
+                        bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq][id]['strand']      = row['STRAND']