migrate_to_sqlite: Migrate referring data to include date stamp · astrochun/github-stats-pages@5a753a3 · GitHub
[go: up one dir, main page]

Skip to content

Commit 5a753a3

Browse files
committed
migrate_to_sqlite: Migrate referring data to include date stamp
1 parent b0313c0 commit 5a753a3

File tree

1 file changed

+27
-7
lines changed

1 file changed

+27
-7
lines changed

scripts/migrate_to_sqlite

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
11
#!/usr/bin/env python
22
from pathlib import Path
33

4+
import pandas as pd
5+
46
from github_stats_pages import db
57
from github_stats_pages.logger import app_log as log
6-
from github_stats_pages.models import Clone, Traffic, Paths
8+
from github_stats_pages.models import Clone, Traffic, Referring, Paths
9+
10+
DROP_DUPLICATES_SUBSET = ["date", "repository_name", "site"]
711

812

913
if __name__ == "__main__":
@@ -14,13 +18,29 @@ if __name__ == "__main__":
1418
log.info("SQLite DB exists!")
1519
engine = db.create_db_and_tables()
1620

17-
merged_files = [
18-
x
19-
for x in sorted(Path("data").glob("merged_*.csv"))
20-
if x.name.find("refer") == -1
21-
]
21+
p_data = Path("data")
22+
23+
# Handle referrer files (missing date field)
24+
referrer_files = list(p_data.glob("*referrer-stats.csv"))
25+
log.info(f"Number of referrer files: {len(referrer_files)}")
26+
referrer_merged_df = pd.DataFrame()
27+
for r_file in referrer_files:
28+
file_date = r_file.name.rstrip("data/")[:10]
29+
r_df = pd.read_csv(r_file)
30+
r_df.insert(loc=0, column="date", value=file_date)
31+
referrer_merged_df = referrer_merged_df.append(r_df, ignore_index=True)
32+
if not referrer_merged_df.empty:
33+
referrer_merged_df.drop_duplicates(
34+
subset=DROP_DUPLICATES_SUBSET, keep="last", inplace=True
35+
)
36+
log.info(f"Referrer record number: {len(referrer_merged_df)}")
37+
referrer_outfile = p_data / "merged_referrer.csv"
38+
log.info(f"Writing: {referrer_outfile}")
39+
referrer_merged_df.to_csv(referrer_outfile, header=False, index=False)
40+
41+
merged_files = [x for x in sorted(p_data.glob("merged_*.csv"))]
2242
if merged_files:
23-
model_list = [Clone, Paths, Traffic]
43+
model_list = [Clone, Paths, Referring, Traffic]
2444
for file, model in zip(merged_files, model_list):
2545
db.migrate_csv(file, model=model, engine=engine)
2646
else:

0 commit comments

Comments
 (0)
0