From be5396a615f54be6e89646f1a8cb473b701e662a Mon Sep 17 00:00:00 2001
From: marius <11855163+norandom@users.noreply.github.com>
Date: Sat, 21 Sep 2024 09:51:59 +0200
Subject: [PATCH] zipline yfinance import with different exchange calendars

---
 Zipline/free_data_zipline_bundles.py | 172 +++++++++++++++++++++++++++
 1 file changed, 172 insertions(+)
 create mode 100644 Zipline/free_data_zipline_bundles.py

diff --git a/Zipline/free_data_zipline_bundles.py b/Zipline/free_data_zipline_bundles.py
new file mode 100644
index 0000000..f49bb5a
--- /dev/null
+++ b/Zipline/free_data_zipline_bundles.py
@@ -0,0 +1,172 @@
+import os
+import shutil
+import warnings
+from datetime import datetime, timedelta
+
+import numpy as np
+import pandas as pd
+import pytz
+import yfinance as yf
+from zipline.data import bundles
+from zipline.data.bundles import register
+from zipline.data.bundles.csvdir import csvdir_equities
+from zipline.data.data_portal import DataPortal
+from zipline.utils.calendar_utils import get_calendar
+from zipline.utils.cli import maybe_show_progress
+
+# Disable pandas PerformanceWarnings and FutureWarnings
+warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
+warnings.filterwarnings("ignore", category=FutureWarning)
+
+
+def safe_float_to_uint32(value):
+    """Safely convert float to uint32, clamping values if necessary."""
+    return np.uint32(max(0, min(value, np.iinfo(np.uint32).max)))
+
+
+def process_symbol_data(df, trading_days, missing_dates):
+    """
+    Process the raw yfinance frame for a single symbol, adjust the prices,
+    and fill missing dates.
+
+    Parameters:
+        df (pandas.DataFrame): The input dataframe containing the symbol's data.
+        trading_days (list): The trading days of the target exchange calendar.
+        missing_dates (list): Dates that must be present in the output.
+
+    Returns:
+        pandas.DataFrame: The adjusted dataframe with missing dates filled.
+
+    Processing steps:
+    - Rename the columns and drop rows with missing values.
+    - Compute an 'adj_factor' as 'adj_close' / 'close', multiply the 'open',
+      'high', 'low' and 'close' columns by it, then drop the 'adj_close' and
+      'adj_factor' columns.
+    - Rename the index to 'date' and initialise 'dividend' with 0.0 and
+      'split' with 1.0.
+    - Reindex the frame to trading_days and forward-fill missing values.
+    - For each date in missing_dates that is not yet in the frame, copy the
+      previous date's values onto it, then sort by date.
+    - Round 'open', 'high', 'low' and 'close' and convert them to uint32.
+    - Fill missing 'volume' values with 0, clip to non-negative integers and
+      convert to uint32.
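+
+    Illustrative usage sketch (not part of the ingest pipeline; assumes the
+    same yfinance column layout that main() below relies on):
+
+        calendar = get_calendar('XNYS')
+        days = [d for d in pd.date_range("2010-01-04", "2010-03-01", freq='D')
+                if calendar.is_session(d)]
+        raw = yf.download(tickers=["AAPL"], start="2010-01-04",
+                          end="2010-03-01", group_by='ticker')
+        clean = process_symbol_data(raw["AAPL"].copy(), days, [])
+        clean.to_csv("AAPL.csv")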
+ """ + + +def main(): + symbols = ['AAPL', 'NFLX', 'NVDA', 'JPM', 'SPY', + "GC=F", "SI=F", "CL=F", "ZW=F", "PL=F", "ZC=F", "ZS=F", "KC=F", + "CC=F", "^GDAXI", "^GSPC"] + + START_DATE = "2010-01-04" + END_DATE = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d") # Use yesterday's date + + print(f"Downloading data for {len(symbols)} symbols from {START_DATE} to {END_DATE}") + stocks_df = yf.download(tickers=symbols, start=START_DATE, end=END_DATE, group_by='ticker') + print("Data download completed") + + calendar = get_calendar('XNYS') + start_session = pd.Timestamp(START_DATE) + end_session = pd.Timestamp(END_DATE) + + # Use pandas date_range instead of calendar.sessions_window + all_days = pd.date_range(start=start_session, end=end_session, freq='D') + trading_days = [day for day in all_days if calendar.is_session(day)] + + print(f"Total NYSE trading days: {len(trading_days)}") + + base_dir = os.path.expanduser('~/.zipline/custom_data/daily') + os.makedirs(base_dir, exist_ok=True) + + missing_dates = [pd.Timestamp('2011-01-03'), pd.Timestamp('2016-10-10'), pd.Timestamp('2016-11-11')] + + for symbol in symbols: + if symbol in stocks_df.columns.get_level_values(0): + df = process_symbol_data(stocks_df[symbol].copy(), trading_days, missing_dates) + print(f"\nSymbol: {symbol}") + print(f"Date range: {df.index.min()} to {df.index.max()}") + print(f"Number of rows: {len(df)}") + print(f"Unique dates: {df.index.nunique()}") + print("Sample of data:") + print(df.head()) + csv_file_path = os.path.join(base_dir, f'{symbol}.csv') + df.to_csv(csv_file_path) + print(f"CSV file saved: {csv_file_path}") + + print("\nAll CSV files have been created") + + bundle_path = os.path.expanduser("~/.zipline/data/custom_data") + if os.path.exists(bundle_path): + shutil.rmtree(bundle_path) + print(f"Removed existing bundle data: {bundle_path}") + + CSV_FOLDER_PATH = os.path.expanduser("~/.zipline/custom_data") + + print("\nRegistering the bundle") + register( + 'custom_data', + csvdir_equities(['daily'], CSV_FOLDER_PATH), + calendar_name='XNYS', + ) + + print("Ingesting the bundle") + try: + bundles.ingest('custom_data') + print("Bundle ingestion completed successfully") + except Exception as e: + print(f"Error during bundle ingestion: {str(e)}") + raise + + print("\nLoading the bundle") + try: + bundle = bundles.load('custom_data') + print("Bundle loaded successfully") + except Exception as e: + print(f"Error loading bundle: {str(e)}") + raise + + assets = bundle.asset_finder.retrieve_all(bundle.asset_finder.sids) + print(f"Bundle contains {len(assets)} assets") + + print("\nCreating DataPortal for data verification") + try: + data_portal = DataPortal( + asset_finder=bundle.asset_finder, + trading_calendar=calendar, + first_trading_day=bundle.equity_daily_bar_reader.first_trading_day, + equity_daily_reader=bundle.equity_daily_bar_reader, + adjustment_reader=bundle.adjustment_reader + ) + print("DataPortal created successfully") + except Exception as e: + print(f"Error creating DataPortal: {str(e)}") + raise + + print("\nRetrieving historical data") + try: + bar_count = len(trading_days) + + panel = data_portal.get_history_window( + assets=assets, + end_dt=end_session, + bar_count=bar_count, + frequency='1d', + field='close', + data_frequency='daily' + ) + print("Historical data retrieved successfully") + print(f"\nData retrieved for {len(assets)} assets") + print(f"Date range: {panel.index[0]} to {panel.index[-1]}") + print(f"Number of trading days: {len(panel)}") + print("\nSample of retrieved data:") + 
+        print(panel.head())
+    except Exception as e:
+        print(f"Error retrieving historical data: {str(e)}")
+        raise
+
+    print("\nScript completed successfully")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
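
Once the bundle above has been ingested, it can be consumed like any other zipline bundle. The snippet below is a minimal backtest sketch, not part of the patch: it assumes the 'custom_data' registration is visible to the backtesting process (either via the register(...) call above in the same process or an equivalent entry in ~/.zipline/extension.py), and the start/end dates and capital base are arbitrary illustrative values.

    # sketch_backtest_custom_data.py -- illustrative only
    import pandas as pd
    from zipline import run_algorithm
    from zipline.api import order_target_percent, symbol


    def initialize(context):
        # SPY is one of the tickers ingested by free_data_zipline_bundles.py.
        context.asset = symbol('SPY')


    def handle_data(context, data):
        # Hold a constant 100% position in the asset.
        order_target_percent(context.asset, 1.0)


    result = run_algorithm(
        start=pd.Timestamp('2015-01-02'),
        end=pd.Timestamp('2019-12-31'),
        initialize=initialize,
        handle_data=handle_data,
        capital_base=100000,
        data_frequency='daily',
        bundle='custom_data',
    )
    print(result[['portfolio_value']].tail())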