Realtime

GDELT + Stock Data: Data Retrieval

I've compiled the code from my previous posts into a single script that I can refer to in later posts to avoid repetition. The code collects the different indicators that I want to use, both in my own strategies and in strategies I've found online.

from collections import Counter
from backtesting import Strategy
from backtesting.lib import crossover
from gdeltdoc import GdeltDoc, Filters
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
import numpy as np
from scipy.stats import pearsonr, zscore
import seaborn as sns

# Query window, ticker and keyword. START_DATE/END_DATE are used both for the
# GDELT filter below and for the yfinance price download further down, so the
# news series and the price series cover the same period.
START_DATE = "2023-12-30"
END_DATE = "2024-12-30"
TICKER = "SPY"
SEARCH_TERM = "economy"

# Set up GDELT filters
f = Filters(
    keyword=SEARCH_TERM,
    start_date=START_DATE,
    end_date=END_DATE,
    # Optional theme restriction, currently disabled:
    #theme="WB_625_HEALTH_ECONOMICS_AND_FINANCE"
)

gd = GdeltDoc()
# Article-level search results for the filter (not used by the helpers below).
articles = gd.article_search(f)
# Two timeline queries for the same filter: tone and raw volume.
timeline_tone = gd.timeline_search("timelinetone", f)
timeline_raw = gd.timeline_search("timelinevolraw", f)
# Left-join on "datetime" so every tone row is kept. The helper functions
# below read the "Average Tone", "Article Count" and "All Articles" columns
# from this merged frame.
timeline = timeline_tone.merge(timeline_raw, how="left", on="datetime")

def get_tone(df, timeline_df=None):
    """Look up the GDELT average tone for every date in ``df.index``.

    Args:
        df: Frame whose index entries expose ``.date()`` (for example a
            price frame indexed by timestamps).
        timeline_df: Optional frame with a datetime-typed ``datetime``
            column and an ``"Average Tone"`` column. When omitted, the
            module-level ``timeline`` frame is used.

    Returns:
        A ``pd.Series`` with a default integer index and one entry per row
        of ``df``: the tone as a float, or ``0`` when the date does not
        occur in the timeline. If the timeline holds several rows for the
        same calendar date, the last one is used.

    Side effects:
        Adds (or overwrites) a ``datetime_clean`` column holding the calendar
        date of each row on the timeline frame that is used.
    """
    source = timeline if timeline_df is None else timeline_df
    source["datetime_clean"] = source.datetime.dt.date
    # Build the date -> value lookup once instead of re-scanning the whole
    # timeline for every row of df. Converting scalars also avoids calling
    # float() on a one-row Series, which recent pandas versions deprecate.
    tone_by_date = {
        day: float(tone)
        for day, tone in zip(source["datetime_clean"], source["Average Tone"])
    }
    return pd.Series([tone_by_date.get(stamp.date(), 0) for stamp in df.index])

def article_count(df, timeline_df=None):
    """Look up the GDELT matching-article count for every date in ``df.index``.

    Args:
        df: Frame whose index entries expose ``.date()`` (for example a
            price frame indexed by timestamps).
        timeline_df: Optional frame with a datetime-typed ``datetime``
            column and an ``"Article Count"`` column. When omitted, the
            module-level ``timeline`` frame is used.

    Returns:
        A ``pd.Series`` with a default integer index and one entry per row
        of ``df``: the count as a float, or ``0`` when the date does not
        occur in the timeline. If the timeline holds several rows for the
        same calendar date, the last one is used.

    Side effects:
        Adds (or overwrites) a ``datetime_clean`` column holding the calendar
        date of each row on the timeline frame that is used.
    """
    source = timeline if timeline_df is None else timeline_df
    source["datetime_clean"] = source.datetime.dt.date
    # Build the date -> value lookup once instead of re-scanning the whole
    # timeline for every row of df. Converting scalars also avoids calling
    # float() on a one-row Series, which recent pandas versions deprecate.
    count_by_date = {
        day: float(value)
        for day, value in zip(source["datetime_clean"], source["Article Count"])
    }
    return pd.Series([count_by_date.get(stamp.date(), 0) for stamp in df.index])

def article_all(df, timeline_df=None):
    """Look up the GDELT ``"All Articles"`` value for every date in ``df.index``.

    Args:
        df: Frame whose index entries expose ``.date()`` (for example a
            price frame indexed by timestamps).
        timeline_df: Optional frame with a datetime-typed ``datetime``
            column and an ``"All Articles"`` column. When omitted, the
            module-level ``timeline`` frame is used.

    Returns:
        A ``pd.Series`` with a default integer index and one entry per row
        of ``df``: the value as a float, or ``0`` when the date does not
        occur in the timeline. If the timeline holds several rows for the
        same calendar date, the last one is used.

    Side effects:
        Adds (or overwrites) a ``datetime_clean`` column holding the calendar
        date of each row on the timeline frame that is used.
    """
    source = timeline if timeline_df is None else timeline_df
    source["datetime_clean"] = source.datetime.dt.date
    # Build the date -> value lookup once instead of re-scanning the whole
    # timeline for every row of df. Converting scalars also avoids calling
    # float() on a one-row Series, which recent pandas versions deprecate.
    total_by_date = {
        day: float(value)
        for day, value in zip(source["datetime_clean"], source["All Articles"])
    }
    return pd.Series([total_by_date.get(stamp.date(), 0) for stamp in df.index])

def get_volume(df):
    """Return the ``Volume`` attribute of ``df`` unchanged."""
    volume_column = df.Volume
    return volume_column

# Daily price data for TICKER over the same window as the GDELT query.
# multi_level_index=False is passed so the columns are plain names rather
# than a (field, ticker) MultiIndex.
data = yf.download(TICKER, start=START_DATE, end=END_DATE, multi_level_index=False)

# Attach the GDELT series to the price frame. The helpers return Series with a
# default integer index, so they are converted to plain lists: assigning the
# Series directly would align on the index and not match the date index.
# The helpers are defined above as article_count / article_all; the previous
# get_article_count / get_all_count names do not exist and raised NameError.
data["Average Tone"] = list(get_tone(data))
data["Article Count"] = list(article_count(data))
data["All Articles"] = list(article_all(data))

def SMA(values, n):
    """Return the simple moving average of ``values`` over a window of ``n``.

    The first ``n - 1`` entries of the result are NaN because the rolling
    window is not yet full.
    """
    series = pd.Series(values)
    window = series.rolling(n)
    return window.mean()
data
Close Volume Average Tone Article Count All Articles
Date
2024-01-02 466.663940 123623700 0.1338 7402.0 113699.0
2024-01-03 462.852844 103585900 -0.0392 8467.0 140100.0
2024-01-04 461.361969 84232200 -0.0177 11418.0 171729.0
2024-01-05 461.993866 86060800 0.0540 9509.0 159932.0
2024-01-08 468.589294 74879100 0.1491 8584.0 145645.0
... ... ... ... ... ...
2024-12-20 591.150024 125716700 0.4267 11918.0 170707.0
2024-12-23 594.690002 57635800 0.3075 10892.0 169788.0
2024-12-24 601.299988 33160100 0.4874 8868.0 132293.0
2024-12-26 601.340027 41219100 0.4670 9350.0 142705.0
2024-12-27 595.010010 64847900 0.1646 10576.0 158638.0

250 rows × 5 columns

Note that I've dropped some columns from the output to avoid styling issues with the post.