In this notebook, we will go through how to use Google BigQuery to access the GDELT project data, along with some simple usage examples.
Useful References:
!pip install --upgrade 'google-cloud-bigquery[bqstorage,pandas]'
!pip install db-dtypes
!pip install basemap
!pip install networkx
import os
import sys
import math
import numpy as np
import geopandas as gpd
from google.cloud import bigquery
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
import networkx as nx
%load_ext google.cloud.bigquery
Authenticate with Google Cloud:
You need to authenticate your Python code to access Google Cloud resources. You can do this by creating a service account and downloading the JSON key file.
# Point the BigQuery client at a service-account key file.
# GOOGLE_APPLICATION_CREDENTIALS must hold the path to the JSON key
# downloaded from the Google Cloud console — replace the value below
# with the path to your own key file.
# (The original cell set a "path-to-CREDENTIALS.json" placeholder and then
# immediately overwrote it; a single assignment is enough. `os` is already
# imported at the top of the notebook.)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "iguide-gdelt.json"

# Construct a BigQuery client object; it picks up the credentials
# from the environment variable set above.
client = bigquery.Client()
# Count multiple births (plurality > 1) per year in the public natality
# sample table, most frequent combinations first.
natality_sql = """
SELECT
plurality,
COUNT(1) AS count,
year
FROM
`bigquery-public-data.samples.natality`
WHERE
NOT IS_NAN(plurality) AND plurality > 1
GROUP BY
plurality, year
ORDER BY
count DESC
"""

# Run the query and materialize the result as a pandas DataFrame.
df = client.query(natality_sql).to_dataframe()
df.head()
The database structure is project --> dataset --> table. The project id for the GDELT project in Google BigQuery is "gdelt-bq".
First, we investigate what datasets and tables are accessible.
# Project id of the GDELT project on BigQuery
project_id = "gdelt-bq"

# Client scoped to the GDELT project so listing calls resolve against it
client = bigquery.Client(project=project_id)

# Enumerate every dataset in the project, then the tables inside each one
datasets = list(client.list_datasets())
for dataset in datasets:
    print(f"Dataset: {dataset.dataset_id}")
    table_names = [tbl.table_id for tbl in client.list_tables(dataset)]
    print(f"Tables: {', '.join(table_names)}\n--------")
There are two ways to check the tables in a given dataset: you can either use the BigQuery client library directly, or the magic function %%bigquery. The magic function %%bigquery takes SQL as input and generates a pandas DataFrame directly.
a. BigQuery client library
# Inspect the "full" dataset (resolved against the client's project, gdelt-bq)
# NOTE(review): Client.dataset() is deprecated in newer google-cloud-bigquery
# releases; bigquery.DatasetReference("gdelt-bq", "full") is the modern
# spelling — confirm against the installed library version.
dataset_ref = client.dataset("full")
dataset = client.get_dataset(dataset_ref)

# Print every table id in the dataset, one per line
tables = list(client.list_tables(dataset))
print("\n".join(tbl.table_id for tbl in tables))
# Use a default Client() here rather than project="gdelt-bq" — the client's
# project is the one the query job runs (and is billed) under; we only need
# read access to the public gdelt-bq tables named in the SQL itself.
# (Presumably this is what the original warning about "creating a new table"
# meant — confirm against google-cloud-bigquery job semantics.)
client = bigquery.Client()
# List all tables in the `full` dataset via its INFORMATION_SCHEMA view.
sql = """
SELECT * FROM `gdelt-bq`.full.INFORMATION_SCHEMA.TABLES;
"""
table_df_1 = client.query(sql).to_dataframe()
table_df_1.head()
b. magic function %%bigquery
%%bigquery
SELECT * FROM `gdelt-bq.full.INFORMATION_SCHEMA.TABLES`
%%bigquery table_df_2
SELECT * FROM `gdelt-bq.full.INFORMATION_SCHEMA.TABLES`
# bigquery result saved to pandas table
table_df_2.head()
Bigquery magic functions can be used to check dataset schema.
%%bigquery
SELECT * FROM `gdelt-bq.full.INFORMATION_SCHEMA.COLUMNS`
WHERE table_name = 'events';
%%bigquery
SELECT * FROM `gdelt-bq.full.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS`
WHERE table_name = 'events';
%%bigquery
SELECT *
FROM `gdelt-bq.full.events`
LIMIT 5
%%bigquery usa_event_df
SELECT Actor1CountryCode, Actor2CountryCode, count(*) as count
FROM `gdelt-bq.full.events`
WHERE ((Actor1CountryCode = 'USA' and Actor2CountryCode IS NOT NULL)
or (Actor2CountryCode = 'USA' and Actor1CountryCode IS NOT NULL))
and Actor1CountryCode != Actor2CountryCode
and Year = 2022
GROUP BY 1,2
usa_event_df
# Collapse directed country pairs into undirected ones: (A, B) and (B, A)
# map to the same sorted tuple, so their event counts can be summed.
usa_event_df['sorted_pair'] = [
    tuple(sorted(pair))
    for pair in usa_event_df[['Actor1CountryCode', 'Actor2CountryCode']].itertuples(index=False, name=None)
]

# Total event count per undirected pair, as a flat DataFrame
grouped_df = usa_event_df.groupby('sorted_pair', as_index=False)['count'].sum()
# Largest pair count on a log scale; used to normalize line transparency.
max_val = np.log10(grouped_df['count'].max())

def get_alpha(count):
    '''
    Convert an event count to a matplotlib alpha value.

    Log-scaled so heavily-reported pairs do not wash out the rest;
    the result lies in [0, 0.25].
    '''
    # Guard: if every pair has count 1, max_val is log10(1) == 0 and the
    # division below would produce NaN / a zero-division — in that case
    # every line is equally (fully) weighted.
    if max_val <= 0:
        return 0.25
    scale = np.log10(count)
    return (scale / max_val) * 0.25
# Draw great-circle arcs between the USA and each partner country, with line
# transparency proportional to (log-scaled) event count.
plt.figure(figsize=(18,18))
# Load the world shape map
# NOTE(review): gpd.datasets.get_path was deprecated and removed in
# geopandas 1.0 — on newer versions the naturalearth_lowres file must be
# downloaded separately; confirm the installed geopandas version.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
# Country centroids, keyed by ISO-3 country code so they can be looked up
# from the Actor*CountryCode values in sorted_pair.
# NOTE(review): .centroid on a geographic (lat/lon) CRS triggers a shapely
# warning; the values are presumably close enough for this visualization.
world['centroid'] = world.centroid
centroids = world.set_index('iso_a3')['centroid']
# Mollweide projection covering (most of) the globe
event_map = Basemap(projection='moll',
llcrnrlat=-80,urcrnrlat=80,
llcrnrlon=-180,urcrnrlon=180,
lon_0=0, lat_0=0)
event_map.drawcoastlines()
event_map.drawcountries()
event_map.fillcontinents(color='0.6')
event_map.drawmapboundary()
# One red arc per undirected country pair; pairs whose code is missing from
# the shapefile index are silently skipped.
for index, row in grouped_df.iterrows():
    country1, country2 = row['sorted_pair']
    if country1 in centroids.index and country2 in centroids.index:
        point1 = centroids.loc[country1]
        point2 = centroids.loc[country2]
        # Convert centroids to lat-long for Basemap
        # (Point.xy returns a pair of coordinate arrays, hence the [0] below)
        lon1, lat1 = point1.xy
        lon2, lat2 = point2.xy
        # log-scaled alpha in [0, 0.25] from the pair's event count
        line_alpha = get_alpha(row['count'])
        event_map.drawgreatcircle(lon1[0], lat1[0], lon2[0], lat2[0],
                                  linewidth=2,
                                  color='r',
                                  alpha=line_alpha)
plt.show()
%%bigquery
SELECT * FROM `gdelt-bq.gdeltv2.INFORMATION_SCHEMA.COLUMN_FIELD_PATHS`
WHERE table_name = 'geg_gcnlapi';
%%bigquery entity_graph
SELECT Source, Target, Count RawCount, "Undirected" Type, ( Count/SUM(Count) OVER () ) Weight FROM (
SELECT a.entity Source, b.entity Target, COUNT(*) as Count
FROM (
(SELECT url, entities.name entity FROM `gdelt-bq.gdeltv2.geg_gcnlapi`, unnest(entities) entities
where entities.mid is not null and entities.type = 'PERSON'
and date >= "2019-02-05 00:00:00" AND date < "2019-02-06 00:00:00")
) a
JOIN (
(SELECT url, entities.name entity FROM `gdelt-bq.gdeltv2.geg_gcnlapi`, unnest(entities) entities
where entities.mid is not null and entities.type = 'PERSON'
and date >= "2019-02-05 00:00:00" AND date < "2019-02-06 00:00:00")
) b
ON a.url=b.url
WHERE a.entity<b.entity
GROUP BY 1,2
ORDER BY 3 DESC
LIMIT 1500
)
order by Count Desc
# Peek at the raw person co-occurrence edges
entity_graph.head()
# Keep only edges in which one endpoint is Donald Trump
entity_graph_filtered = entity_graph[
    entity_graph['Source'].eq('Donald Trump') | entity_graph['Target'].eq('Donald Trump')
]
entity_graph_filtered
# Build an undirected weighted graph from the filtered edge list;
# each (Source, Target, Weight) row becomes one edge with a 'weight' attribute.
g = nx.Graph()
g.add_weighted_edges_from(
    entity_graph_filtered[['Source', 'Target', 'Weight']].itertuples(index=False, name=None)
)
# Set up the figure with a white background and no axes
plt.figure(figsize=(20, 15))
plt.title('People most associated in global news with Donald Trump', y=0.97, fontsize=20, fontweight='bold')
ax = plt.gca()
ax.set_axis_off()

# Restrict the drawing to the largest connected component
gmax = max((g.subgraph(c) for c in nx.connected_components(g)), key=len)

# Per-node degree (node sizes) and betweenness centrality (node colors).
# Both views iterate in graph node order, so the lists built from them align.
d = nx.degree(gmax)
b = nx.betweenness_centrality(gmax).items()

# Arrange all nodes on a circle
pos = nx.circular_layout(gmax)

# Edge widths: normalize weights to [0, 1], then scale by 20 per edge.
# BUG FIX: the original `normalized_weights * 20` replicated the Python
# list 20 times instead of scaling each width.
edge_weights = nx.get_edge_attributes(gmax, 'weight')
max_weight = max(edge_weights.values())
edge_widths = [20 * w / max_weight for w in edge_weights.values()]

# (the invalid nodelist= argument to draw_networkx_edges was dropped;
# edge width order matches the default edgelist = gmax.edges())
nx.draw_networkx_edges(gmax, pos, alpha=0.9, width=edge_widths)
nx.draw_networkx_nodes(gmax, pos, nodelist=[v[0] for v in b],
                       node_size=[v[1] * 20 for v in d],
                       node_color=[math.log(1 + v[1] * 20) for v in b],
                       cmap=plt.cm.Blues)
nx.draw_networkx_labels(gmax, pos, labels={v[0]: v[0] for v in b})

# Display the graph
plt.show()