|
import os
import pandas as pd  # type: ignore
from collections import defaultdict
import pickle
import json
from typing import DefaultDict, Dict, Any, BinaryIO

# Directory containing the daily aggregate CSV files (one file per trading day).
data_dir = "./aggregates_day/"

# ticker -> list of {"date", "trades", "close_price"} records, in file order.
trades_data = defaultdict(list)

# Process files in sorted (assumed chronological) name order.
files = sorted(f for f in os.listdir(data_dir) if f.endswith(".csv"))

print("Starting to process files...")

for file in files:
    print(f"Processing {file}")
    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path)

    # Convert the nanosecond epoch timestamps column-wise, once per file —
    # far cheaper than one pd.to_datetime call per row inside iterrows().
    dates = pd.to_datetime(df["window_start"], unit="ns").dt.date

    # Iterate plain column values with zip instead of DataFrame.iterrows():
    # iterrows materializes a Series per row and is orders of magnitude slower.
    for ticker, date, trades, close_price in zip(
        df["ticker"], dates, df["transactions"], df["close"]
    ):
        trades_data[ticker].append(
            {"date": date, "trades": trades, "close_price": close_price}
        )

print("Finished processing files.")
print("Building lookup table...")

# Nested mapping: ticker -> "YYYY-MM-DD" date string -> per-day stats.
lookup_table: DefaultDict[str, Dict[str, Any]] = defaultdict(dict)

for ticker, records in trades_data.items():
    df_ticker = pd.DataFrame(records)
    # Order chronologically so pct_change/rolling only ever look backwards.
    df_ticker.sort_values("date", inplace=True)
    df_ticker.set_index("date", inplace=True)

    # Day-over-day close-price change, expressed as a percentage.
    # NOTE: the first row of each ticker has no previous close, so this is
    # NaN there — stored as-is, matching the original behavior.
    df_ticker["price_diff"] = df_ticker["close_price"].pct_change() * 100

    # Shift by one day so the current day is excluded from its own rolling
    # statistics: "today" is compared against the *previous* 5 trading days.
    shifted = df_ticker["trades"].shift(1)
    df_ticker["avg_trades"] = shifted.rolling(window=5).mean()
    df_ticker["std_trades"] = shifted.rolling(window=5).std()

    for date, row in df_ticker.iterrows():
        # String key keeps the table JSON-serializable later if needed.
        date_str = date.strftime("%Y-%m-%d")
        # Rolling stats are unavailable (NaN) until 5 prior days exist;
        # expose that as None. One entry shape for both cases — the original
        # duplicated the whole dict literal across two branches.
        has_stats = pd.notnull(row["avg_trades"]) and pd.notnull(row["std_trades"])
        lookup_table[ticker][date_str] = {
            "trades": row["trades"],
            "close_price": row["close_price"],
            "price_diff": row["price_diff"],
            "avg_trades": row["avg_trades"] if has_stats else None,
            "std_trades": row["std_trades"] if has_stats else None,
        }

print("Lookup table built successfully.")
| 83 | + |
# Convert the defaultdict to a plain dict so consumers that unpickle this
# file get a standard dict with no surprise default factory. dict(...) is
# the idiomatic copy (the identity dict-comprehension was ruff C416).
lookup_table_dict = dict(lookup_table)

# Persist the lookup table for later use.
with open("lookup_table.pkl", "wb") as f:
    pickle.dump(lookup_table_dict, f)

print("Lookup table saved to 'lookup_table.pkl'.")