GDELT + Stock Data: Data Retrieval
I've compiled my previous posts into a single piece of code that I can refer to in later posts to avoid repetition. The code gathers the different indicators that I want to consider, both in my own strategies and in strategies I've found online.
from collections import Counter
from backtesting import Strategy
from backtesting.lib import crossover
from gdeltdoc import GdeltDoc, Filters
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
import numpy as np
from scipy.stats import pearsonr, zscore
import seaborn as sns
# Query window, price ticker and GDELT keyword shared by everything below.
START_DATE = "2023-12-30"
END_DATE = "2024-12-30"
TICKER = "SPY"
SEARCH_TERM = "economy"
# Set up GDELT filters
f = Filters(
keyword=SEARCH_TERM,
start_date=START_DATE,
end_date=END_DATE,
#theme="WB_625_HEALTH_ECONOMICS_AND_FINANCE"
)
gd = GdeltDoc()
# Article-level search results; not referenced again in the visible code.
articles = gd.article_search(f)
# Two timeline queries over the same filters. The lookup helpers further down
# read "Average Tone", "Article Count" and "All Articles" from the merged
# frame -- presumably the tone query supplies the first and the raw-volume
# query the other two (confirm against the gdeltdoc docs).
timeline_tone = gd.timeline_search("timelinetone", f)
timeline_raw = gd.timeline_search("timelinevolraw", f)
# Left join on the shared "datetime" column keeps every tone row.
timeline = timeline_tone.merge(timeline_raw, how="left", on="datetime")
def _timeline_values(df, column, timeline_df=None):
    """Look up one timeline column for every row of ``df``, matched by date.

    Args:
        df: Frame whose index holds timestamps (anything with ``.date()``).
        column: Name of the timeline column to read.
        timeline_df: Frame with a ``datetime`` column and ``column``.
            Defaults to the module-level ``timeline``.

    Returns:
        A float ``pd.Series`` with a default integer index, one entry per
        row of ``df``. Dates absent from the timeline map to ``0.0``; if
        the timeline has several rows for one date, the first is used.
    """
    if timeline_df is None:
        # Fall back to the module-level frame, and keep writing the
        # ``datetime_clean`` helper column onto it as the earlier
        # implementation did, in case other code reads that column.
        timeline_df = timeline
        timeline_df["datetime_clean"] = timeline_df["datetime"].dt.date

    days = timeline_df["datetime"].dt.date
    values = timeline_df[column].astype(float)

    # Build the date -> value table once instead of rescanning the timeline
    # for every row; ``setdefault`` keeps the first row of a duplicated day
    # rather than failing on a multi-row match.
    by_day = {}
    for day, value in zip(days, values):
        by_day.setdefault(day, value)

    return pd.Series(
        [by_day.get(stamp.date(), 0.0) for stamp in df.index],
        dtype=float,
    )


def get_tone(df, timeline_df=None):
    """Return the timeline's "Average Tone" for each date in ``df.index``.

    Dates with no timeline row get ``0.0``. ``timeline_df`` defaults to the
    module-level ``timeline``.
    """
    return _timeline_values(df, "Average Tone", timeline_df)


def article_count(df, timeline_df=None):
    """Return the timeline's "Article Count" for each date in ``df.index``.

    Dates with no timeline row get ``0.0``. ``timeline_df`` defaults to the
    module-level ``timeline``.
    """
    return _timeline_values(df, "Article Count", timeline_df)


def article_all(df, timeline_df=None):
    """Return the timeline's "All Articles" for each date in ``df.index``.

    Dates with no timeline row get ``0.0``. ``timeline_df`` defaults to the
    module-level ``timeline``.
    """
    return _timeline_values(df, "All Articles", timeline_df)
def get_volume(df):
    """Return the ``Volume`` attribute (column) of ``df`` unchanged."""
    volume = df.Volume
    return volume
# Price history for TICKER over the query window, with flat column names.
data = yf.download(TICKER, start=START_DATE, end=END_DATE, multi_level_index=False)

# The lookup helpers return Series with a fresh 0..n-1 index, so the values
# are attached positionally via ``list`` rather than aligned on the index.
# The helpers are named ``article_count`` / ``article_all``; the previous
# ``get_article_count`` / ``get_all_count`` calls raised NameError.
data["Average Tone"] = list(get_tone(data))
data["Article Count"] = list(article_count(data))
data["All Articles"] = list(article_all(data))
def SMA(values, n):
    """Return the simple moving average of ``values`` over ``n`` periods.

    The result is a ``pd.Series``; positions with fewer than ``n``
    observations behind them are NaN.
    """
    series = pd.Series(values)
    window = series.rolling(n)
    return window.mean()
# Bare expression: displays the assembled frame as the notebook cell output.
data
Date | Close | Volume | Average Tone | Article Count | All Articles |
---|---|---|---|---|---|
2024-01-02 | 466.663940 | 123623700 | 0.1338 | 7402.0 | 113699.0 |
2024-01-03 | 462.852844 | 103585900 | -0.0392 | 8467.0 | 140100.0 |
2024-01-04 | 461.361969 | 84232200 | -0.0177 | 11418.0 | 171729.0 |
2024-01-05 | 461.993866 | 86060800 | 0.0540 | 9509.0 | 159932.0 |
2024-01-08 | 468.589294 | 74879100 | 0.1491 | 8584.0 | 145645.0 |
... | ... | ... | ... | ... | ... |
2024-12-20 | 591.150024 | 125716700 | 0.4267 | 11918.0 | 170707.0 |
2024-12-23 | 594.690002 | 57635800 | 0.3075 | 10892.0 | 169788.0 |
2024-12-24 | 601.299988 | 33160100 | 0.4874 | 8868.0 | 132293.0 |
2024-12-26 | 601.340027 | 41219100 | 0.4670 | 9350.0 | 142705.0 |
2024-12-27 | 595.010010 | 64847900 | 0.1646 | 10576.0 | 158638.0 |
250 rows × 5 columns