1
1
#!/usr/bin/env python
2
2
from pathlib import Path
3
3
4
+ import pandas as pd
5
+
4
6
from github_stats_pages import db
5
7
from github_stats_pages .logger import app_log as log
6
- from github_stats_pages .models import Clone , Traffic , Paths
8
+ from github_stats_pages .models import Clone , Traffic , Referring , Paths
9
+
10
+ DROP_DUPLICATES_SUBSET = ["date" , "repository_name" , "site" ]
7
11
8
12
9
13
if __name__ == "__main__" :
@@ -14,13 +18,29 @@ if __name__ == "__main__":
14
18
log .info ("SQLite DB exists!" )
15
19
engine = db .create_db_and_tables ()
16
20
17
- merged_files = [
18
- x
19
- for x in sorted (Path ("data" ).glob ("merged_*.csv" ))
20
- if x .name .find ("refer" ) == - 1
21
- ]
21
p_data = Path("data")

# Handle referrer files (missing date field).
# The per-file CSVs have no "date" column, so the date is taken from the
# first 10 characters of each file name (presumably a YYYY-MM-DD prefix;
# verify against the files actually written to data/).
# The list is sorted so that keep="last" in drop_duplicates below resolves
# duplicates deterministically instead of depending on filesystem order.
referrer_files = sorted(p_data.glob("*referrer-stats.csv"))
log.info(f"Number of referrer files: {len(referrer_files)} ")

referrer_frames = []
for r_file in referrer_files:
    # Path.name never includes the directory, so no prefix stripping is
    # needed before slicing out the date.
    file_date = r_file.name[:10]
    r_df = pd.read_csv(r_file)
    r_df.insert(loc=0, column="date", value=file_date)
    referrer_frames.append(r_df)

# DataFrame.append was removed in pandas 2.0; concatenate once instead.
# pd.concat raises ValueError on an empty list, hence the explicit fallback.
if referrer_frames:
    referrer_merged_df = pd.concat(referrer_frames, ignore_index=True)
else:
    referrer_merged_df = pd.DataFrame()

if not referrer_merged_df.empty:
    referrer_merged_df.drop_duplicates(
        subset=DROP_DUPLICATES_SUBSET, keep="last", inplace=True
    )
    log.info(f"Referrer record number: {len(referrer_merged_df)} ")
    referrer_outfile = p_data / "merged_referrer.csv"
    log.info(f"Writing: {referrer_outfile} ")
    # Written without a header row or index column.
    referrer_merged_df.to_csv(referrer_outfile, header=False, index=False)

# Sorted so the files line up positionally with the model list they are
# zipped against further down.
merged_files = sorted(p_data.glob("merged_*.csv"))
22
42
if merged_files :
23
- model_list = [Clone , Paths , Traffic ]
43
+ model_list = [Clone , Paths , Referring , Traffic ]
24
44
for file , model in zip (merged_files , model_list ):
25
45
db .migrate_csv (file , model = model , engine = engine )
26
46
else :
0 commit comments