GDELT + Stock Data: Data Retrieval

31 Dec, 2024

I've compiled my previous posts into a single piece of code that I can refer to in later posts to avoid repetition. The code gathers the different indicators that I want to consider, both in my own strategies and in strategies I've found online.

from collections import Counter
from backtesting import Strategy
from backtesting.lib import crossover
from gdeltdoc import GdeltDoc, Filters
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
import numpy as np
from scipy.stats import pearsonr, zscore
import seaborn as sns

# Query window, instrument and search keyword shared by the GDELT filter
# below and by the price download further down the file.
START_DATE = "2023-12-30"
END_DATE = "2024-12-30"
TICKER = "SPY"
SEARCH_TERM = "economy"

# Set up GDELT filters
# (keyword search restricted to the START_DATE..END_DATE window; the theme
# filter is left commented out.)
f = Filters(
    keyword=SEARCH_TERM,
    start_date=START_DATE,
    end_date=END_DATE,
    #theme="WB_625_HEALTH_ECONOMICS_AND_FINANCE"
)

# Run three queries against GDELT with the same filter: the matching
# articles, plus two timelines ("timelinetone" and "timelinevolraw" modes).
gd = GdeltDoc()
articles = gd.article_search(f)
timeline_tone = gd.timeline_search("timelinetone", f)
timeline_raw = gd.timeline_search("timelinevolraw", f)
# Left-join the two timelines on their shared "datetime" column, keeping every
# row of the tone timeline. The helper functions in this file read the
# "Average Tone", "Article Count" and "All Articles" columns from this frame.
# NOTE(review): which mode supplies which column is not visible here --
# confirm against the gdeltdoc documentation.
timeline = timeline_tone.merge(timeline_raw, how="left", on="datetime")

def get_tone(df):
    """Return the GDELT average tone for each row of ``df``, matched by date.

    Reads the module-level ``timeline`` frame and looks up its
    "Average Tone" column by calendar date.

    Args:
        df: DataFrame whose index holds timestamp-like values exposing
            ``.date()``.

    Returns:
        A ``pd.Series`` with a default integer index and one entry per row
        of ``df``: the tone as a ``float`` when the row's date appears in
        ``timeline``, otherwise ``0``. If ``timeline`` holds several rows
        for the same date, the last one is used.
    """
    # Kept from the original implementation: the helper column is written
    # back onto the shared frame in case other code reads it.
    timeline["datetime_clean"] = timeline.datetime.dt.date

    # Build the date -> value mapping once. This replaces a per-row list
    # rebuild plus boolean-mask filter, and avoids float() on a one-element
    # Series, which recent pandas versions deprecate.
    tone_by_date = dict(zip(timeline["datetime_clean"], timeline["Average Tone"]))

    tones = [
        float(tone_by_date[day]) if day in tone_by_date else 0
        for day in (stamp.date() for stamp in df.index)
    ]
    return pd.Series(tones)

def article_count(df):
    """Return the GDELT matching-article count for each row of ``df``.

    Reads the module-level ``timeline`` frame and looks up its
    "Article Count" column by calendar date.

    Args:
        df: DataFrame whose index holds timestamp-like values exposing
            ``.date()``.

    Returns:
        A ``pd.Series`` with a default integer index and one entry per row
        of ``df``: the count as a ``float`` when the row's date appears in
        ``timeline``, otherwise ``0``. If ``timeline`` holds several rows
        for the same date, the last one is used.
    """
    # Kept from the original implementation: the helper column is written
    # back onto the shared frame in case other code reads it.
    timeline["datetime_clean"] = timeline.datetime.dt.date

    # Build the date -> value mapping once. This replaces a per-row list
    # rebuild plus boolean-mask filter, and avoids float() on a one-element
    # Series, which recent pandas versions deprecate.
    count_by_date = dict(zip(timeline["datetime_clean"], timeline["Article Count"]))

    counts = [
        float(count_by_date[day]) if day in count_by_date else 0
        for day in (stamp.date() for stamp in df.index)
    ]
    return pd.Series(counts)

def article_all(df):
    """Return the GDELT "All Articles" figure for each row of ``df``.

    Reads the module-level ``timeline`` frame and looks up its
    "All Articles" column by calendar date.

    Args:
        df: DataFrame whose index holds timestamp-like values exposing
            ``.date()``.

    Returns:
        A ``pd.Series`` with a default integer index and one entry per row
        of ``df``: the figure as a ``float`` when the row's date appears in
        ``timeline``, otherwise ``0``. If ``timeline`` holds several rows
        for the same date, the last one is used.
    """
    # Kept from the original implementation: the helper column is written
    # back onto the shared frame in case other code reads it.
    timeline["datetime_clean"] = timeline.datetime.dt.date

    # Build the date -> value mapping once. This replaces a per-row list
    # rebuild plus boolean-mask filter, and avoids float() on a one-element
    # Series, which recent pandas versions deprecate.
    total_by_date = dict(zip(timeline["datetime_clean"], timeline["All Articles"]))

    totals = [
        float(total_by_date[day]) if day in total_by_date else 0
        for day in (stamp.date() for stamp in df.index)
    ]
    return pd.Series(totals)

def get_volume(df):
    """Return the ``Volume`` attribute (column) of ``df`` unchanged."""
    volume = df.Volume
    return volume

# Daily bars for TICKER over the same window as the GDELT query.
data = yf.download(TICKER, start=START_DATE, end=END_DATE, multi_level_index=False)

# Attach the GDELT series to the price frame. The helpers return a Series
# with a fresh integer index, so list() is used to assign the values by
# position rather than by index label.
# The helpers defined in this file are `article_count` and `article_all`;
# the previous `get_article_count` / `get_all_count` calls raised NameError.
data["Average Tone"] = list(get_tone(data))
data["Article Count"] = list(article_count(data))
data["All Articles"] = list(article_all(data))

def SMA(values, n):
    """Return the simple moving average of ``values`` over ``n`` observations.

    The input is wrapped in a ``pd.Series``; positions that do not yet have
    a full window of ``n`` observations come back as NaN.
    """
    series = pd.Series(values)
    window = series.rolling(window=n)
    return window.mean()
# Bare expression: in a notebook cell this displays the assembled frame.
data

	Close	Volume	Average Tone	Article Count	All Articles
Date
2024-01-02	466.663940	123623700	0.1338	7402.0	113699.0
2024-01-03	462.852844	103585900	-0.0392	8467.0	140100.0
2024-01-04	461.361969	84232200	-0.0177	11418.0	171729.0
2024-01-05	461.993866	86060800	0.0540	9509.0	159932.0
2024-01-08	468.589294	74879100	0.1491	8584.0	145645.0
...	...	...	...	...	...
2024-12-20	591.150024	125716700	0.4267	11918.0	170707.0
2024-12-23	594.690002	57635800	0.3075	10892.0	169788.0
2024-12-24	601.299988	33160100	0.4874	8868.0	132293.0
2024-12-26	601.340027	41219100	0.4670	9350.0	142705.0
2024-12-27	595.010010	64847900	0.1646	10576.0	158638.0

250 rows × 5 columns

Note that I've dropped some columns from the output to avoid styling issues with the post.