Virtual Field Survey Workflow Showcase of Sandhill Cranes and Their Habitats in North America
Geospatial Data Science Project - Part 3: This notebook documents the workflow for Google Street View 'birding': retrieving and sampling iNaturalist observation records, summarizing land cover around each sample with the USDA Cropland Data Layer, and downloading Google Street View metadata and imagery.
By ZJ Zhou (zhijiez2@illinois.edu)
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from shapely.geometry import Point
!pip install streetview
from streetview import search_panoramas, get_panorama_meta, get_panorama
!pip install earthengine-api
!pip install geemap
import geemap
from pathlib import Path
import os
import ee
import glob
import csv
# Create a Path object for the current directory
current_directory = Path.cwd()
# Get the parent directory
parent_directory = current_directory.parent
print("Current Directory:", current_directory)
print("Parent Directory:", parent_directory)
# Read CSV file into a pandas DataFrame
df = pd.read_csv('observations-360459.csv')
# Convert the "observed_on" column to a datetime data type
df['observed_on'] = pd.to_datetime(df['observed_on'])
# Extract the year and month
df['year'] = df['observed_on'].dt.year
df['month'] = df['observed_on'].dt.month
# Get the basic summary information of the dataset
# df.info()
# Filter out observation records without valid latitude or longitude entries
# Also drop records without a valid observation date or dated before 2008,
# since Google Street View imagery is not available before 2008
filtered_df = df[df['latitude'].notna() & df['longitude'].notna() & df['observed_on'].notna() & (df['year'] >= 2008)]
# Filter out observation records outside of North America
filtered_df = filtered_df[(filtered_df['place_country_name'] == 'United States') |
(filtered_df['place_country_name'] == 'Canada') |
(filtered_df['place_country_name'] == 'Mexico')]
print(f'In total, we have {len(filtered_df)} observation records of Sandhill Cranes across North America since 2008!')
# Define the bounding box coordinates for North America in WGS 84
north_america_bbox = (-180, 20, -50, 85) # (minx, miny, maxx, maxy)
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
# Set up the figure
fig, ax = plt.subplots(figsize=(10, 6))
# Set the extent of the plot to the North America bounding box in WGS 84
ax.set_xlim(north_america_bbox[0], north_america_bbox[2])
ax.set_ylim(north_america_bbox[1], north_america_bbox[3])
# Build geopandas dataframe for mapping
gdf = gpd.GeoDataFrame(filtered_df, geometry=gpd.points_from_xy(filtered_df.longitude, filtered_df.latitude))
world.boundary.plot(ax=ax, linewidth=1)
gdf.plot(ax=ax, color='darkgreen', markersize=10)
plt.title('Locations of iNaturalist Sandhill Crane Observations, North America')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.show()
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# Plot the distribution by year
unique_years = filtered_df['year'].unique()
axes[0].hist(filtered_df['year'], bins=len(unique_years), edgecolor='black', alpha=0.7)
axes[0].set_xlabel('Year')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Distribution by Year')
axes[0].set_xticks(filtered_df['year'].unique())
axes[0].set_xticklabels(filtered_df['year'].unique(), rotation=90, ha="center")
# Plot the distribution by month
unique_months = filtered_df['month'].unique()
axes[1].hist(filtered_df['month'], bins=len(unique_months), edgecolor='black', alpha=0.7)
axes[1].set_xlabel('Month')
axes[1].set_ylabel('Frequency')
axes[1].set_title('Distribution by Month')
axes[1].set_xticks(filtered_df['month'].unique())
fig.suptitle('Temporal Distribution of Sandhill Cranes in North America (2008-2023)\n', fontsize=16)
# plt.tight_layout()
# Add a caption below the plots
caption = "Data Source: iNaturalist (38,559 records)."
plt.figtext(0.5, -0.05, caption, ha="center", fontsize=10, color='grey')
plt.show()
# Combine region info for visualization purposes
top_20_states = filtered_df['place_admin1_name'].value_counts().head(20).index.tolist()
# Work on a copy so the recoding below does not alter filtered_df itself
temp_state_viz = filtered_df.copy()
# Replace state names not in the top 20 with "Other Regions"
temp_state_viz.loc[~temp_state_viz['place_admin1_name'].isin(top_20_states), 'place_admin1_name'] = "Other Regions"
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# Plot the distribution by country
unique_countries = filtered_df['place_country_name'].unique()
axes[0].hist(filtered_df['place_country_name'], bins=len(unique_countries), edgecolor='black', alpha=0.7)
axes[0].set_xlabel('Country')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Distribution by Country')
axes[0].set_xticks(filtered_df['place_country_name'].unique())
axes[0].set_xticklabels(unique_countries, rotation=90, ha="center")
# Plot the distribution by state/province
unique_regions = temp_state_viz['place_admin1_name'].unique()
axes[1].hist(temp_state_viz['place_admin1_name'], bins=len(unique_regions), edgecolor='black', alpha=0.7)
axes[1].set_xlabel('State/Province')
axes[1].set_ylabel('Frequency')
axes[1].set_title('Distribution by State/Province')
axes[1].set_xticks(unique_regions)
axes[1].set_xticklabels(unique_regions, rotation=90, ha="center")
fig.suptitle('Spatial Distribution of Sandhill Cranes in North America (2008-2023)\n', fontsize=16)
# plt.tight_layout()
# Add a caption below the plots
caption = "Data Source: iNaturalist (38,559 records)."
plt.figtext(0.5, -0.1, caption, ha="center", fontsize=10, color='grey')
plt.show()
# Extract coordinates for spatial clustering purpose
coordinates = list(zip(filtered_df['latitude'], filtered_df['longitude']))
# Set the number of clusters as 10
num_clusters = 10
kmeans = KMeans(num_clusters)
# Perform the kmeans clustering algorithm
clusters = kmeans.fit_predict(coordinates)
# print(clusters)
# Add the cluster labels to the original DataFrame
filtered_df['Cluster'] = clusters
# Define the number of samples per cluster
samples_per_cluster = 100
# Group the DataFrame by 'Cluster' and sample within each group
# Note: to reproduce the same samples, keep a fixed random_state parameter
sampled_df = filtered_df.groupby('Cluster').apply(lambda group: group.sample(samples_per_cluster, random_state = 999))
# Reset the index
sampled_df = sampled_df.reset_index(drop=True)
# sampled_df
# sampled_df.columns
print(f"We have successfully extracted {str(len(sampled_df))} samples from our clustered observation records!")
# Set Buffer Directory
buffer_dir = os.path.normpath(f"{current_directory}/sampled_buffers")
if not os.path.exists(buffer_dir):
os.mkdir(buffer_dir)
# Create a GeoDataFrame based on sampled_df
sampled_gdf = gpd.GeoDataFrame(sampled_df, geometry=gpd.points_from_xy(sampled_df.longitude, sampled_df.latitude))
# Specify WGS84 as the reference coordinate system for our sampled dataset
sampled_gdf = sampled_gdf.set_crs('EPSG:4326')
# Reproject it to Web Mercator (units in meters)
sampled_gdf = sampled_gdf.to_crs('EPSG:3857')
# Set buffer size in meters
buffer_size = 100
# Create an empty GeoDataFrame for the buffers to be generated
sampled_buffer_gdf = gpd.GeoDataFrame(columns=['id','geometry', 'year'])
# Generate 100-meter buffers for the GeoDataFrame
# Set the column entries of buffer dataset
sampled_buffer_gdf['id'] = sampled_gdf['id']
sampled_buffer_gdf['year'] = sampled_gdf['year']
sampled_buffer_gdf['geometry'] = sampled_gdf['geometry'].buffer(buffer_size)
# Save buffers by year
for YEAR in range(2013, 2024):
# Define the output shapefile name
output_buffer = os.path.normpath(f"{buffer_dir}/sampled_buffer_{YEAR}.shp")
buffer_gdf = sampled_buffer_gdf[sampled_buffer_gdf['year'] == YEAR]
# Save generated buffers in the specified directory (for back up purposes)
buffer_gdf.to_file(output_buffer, driver='ESRI Shapefile')
# To use geemap or any other Google Earth Engine features in Python, we need to authenticate first.
ee.Authenticate()
ee.Initialize()
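Depending on the earthengine-api version and account setup, ee.Initialize() may require a Google Cloud project to be specified; the alternative initialization pattern below is only a sketch, and the project id is a placeholder rather than part of the original workflow.
# Hypothetical alternative to the plain ee.Initialize() call above for environments
# that require an explicit Cloud project; 'my-ee-project' is a placeholder project id.
try:
    ee.Initialize()
except Exception:
    ee.Initialize(project='my-ee-project')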
for YEAR in range(2013, 2024):
# Specify the buffer storage path
output_buffer = os.path.normpath(f"{buffer_dir}/sampled_buffer_{YEAR}.shp")
# Convert our shapefile to ee (Earth Engine) objects
buffer = geemap.shp_to_ee(output_buffer)
    # Note that since the 2023 CDL has not been published yet, we use the 2022 CDL as a proxy
if YEAR == 2023:
dataset = ee.Image("USDA/NASS/CDL/" + str(2022))
else:
dataset = ee.Image("USDA/NASS/CDL/" + str(YEAR))
# Select the corresponding band in CDL dataset for crop type information
cropland = ee.Image(dataset.select('cropland'))
# Set up the path for CDL statistics output
cdl_stats = os.path.normpath(f'{current_directory}/cdl_stats_{YEAR}.csv')
# Use zonal_statistics_by_group to calculate the area ratio of each crop type within each buffer
# statistics_type can be either 'SUM' or 'PERCENTAGE'
# denominator can be used to convert square meters to other areal units, such as square kilometers
geemap.zonal_statistics_by_group(
cropland.clip(buffer),
buffer,
cdl_stats,
statistics_type='PERCENTAGE',
#denominator=1000000,
#decimal_places=2,
)
# Use the glob module to get a list of CSV file paths in a directory.
file_paths = glob.glob('cdl_stats_*')
# Create an empty list to store DataFrames, one for each CSV file
dataframes = []
# Read each CSV file into a DataFrame and append it to the list of DataFrames
for file_path in file_paths:
df = pd.read_csv(file_path)
dataframes.append(df)
# Concatenate the DataFrames vertically to merge them while retaining all columns
merged_df = pd.concat(dataframes, ignore_index=True)
# Fill missing values (empty entries) with 0
merged_df.fillna(0, inplace=True)
# Drop Earth Engine bookkeeping columns that are not needed for the analysis
column_to_remove = 'system:index'
if column_to_remove in merged_df.columns:
merged_df.drop(columns=[column_to_remove], inplace=True)
column_to_remove = 'Class_sum'
if column_to_remove in merged_df.columns:
merged_df.drop(columns=[column_to_remove], inplace=True)
# Save the merged DataFrame to a new CSV file if needed
merged_df.to_csv('merged_file.csv', index=False)
merged_df.columns
# Load the CSV file containing the crop mapping
codebook_df = pd.read_csv('CDL_crop_codebook.csv')
# Create a custom mapping dictionary from the codebook DataFrame
custom_mapping = dict(zip(codebook_df['Codes'], codebook_df['Class_Names']))
# Iterate through the DataFrame's column names
for col_name in merged_df.columns:
    if col_name not in ('id', 'year'):
# Extract the numerical part from the column name
num = int(col_name.split('_')[1])
# Look up the corresponding crop name from the custom mapping
if num in custom_mapping:
crop_name = custom_mapping[num]
merged_df.rename(columns={col_name: crop_name}, inplace=True)
merged_df.columns
merged_df
corn_df = merged_df[merged_df['Corn'] > 0.5]
print(f"We got {len(corn_df)} points with larger than 50% of its 100-meter buffer covered by corn in our sample!!!")
corn_df['id'].tolist()
joined_corn_df = pd.merge(corn_df['id'], filtered_df, on='id')
joined_corn_df[['id','observed_on','latitude','longitude']]
# Convert values from two columns to lists
lat_list = joined_corn_df['latitude'].values.tolist()
lon_list = joined_corn_df['longitude'].values.tolist()
List_coord = list(zip(lat_list, lon_list))
List_coord
def downloadGSVMetaData(LIST_COORDINATES, TASK):
"""
Downloads Google Street View metadata for a list of coordinates.
Parameters:
- LIST_COORDINATES (list of tuples): A list of latitude and longitude coordinates to search for GSV imagery.
- TASK (str): A task identifier to be used in the generated metadata CSV file name.
This function searches for GSV imagery around the provided coordinates and saves metadata to a CSV file.
"""
# Generate the file path for the metadata CSV file
path = os.path.normpath("GSV_METADATA_{}.csv".format(TASK))
# Open the metadata CSV file for appending
metadata = open(path, "a")
# Create a CSV writer object with newline terminator
writer = csv.writer(metadata, lineterminator='\n')
# Write the header row to the CSV file
writer.writerow(['ID', 'Panoid', 'Lat', 'Lon', 'Heading', 'Pitch', 'Roll', 'Year', 'Month'])
# Initialize an index
index = 0
# Iterate through the list of coordinates (LAT, LON)
for LAT, LON in LIST_COORDINATES:
try:
# Use the search_panoramas function to get panorama information
panoids = search_panoramas(lat=LAT, lon=LON)
        except Exception:
            # Skip this location if the search_panoramas call fails
            print("Search failed at lat={}, lon={}: no GSV imagery in this area.".format(LAT, LON))
            continue
print("Searching GSV Images around...({}, {})".format(LAT, LON))
if len(panoids)>0:
for id in range(len(panoids)):
if panoids[id].date:
year = int(panoids[id].date[:4])
month= int(panoids[id].date[5:])
index += 1
                    # Append this panorama's metadata using the writer created above
                    writer.writerow([index, panoids[id].pano_id, panoids[id].lat, panoids[id].lon,
                                     panoids[id].heading, panoids[id].pitch, panoids[id].roll, year, month])
print(index)
        else:
            print('no GSV imagery found in this area')
            continue
    # Close the metadata file once all coordinates have been processed
    metadata.close()
downloadGSVMetaData(List_coord, 'CORN')
gsv_df = pd.read_csv('GSV_METADATA_CORN.csv').drop_duplicates()
gsv_df
Panoids = gsv_df['Panoid'].iloc[1:len(gsv_df)].tolist()
# Set panoramas directory
pano_dir = os.path.normpath(f"{current_directory}/panorama/")
if not os.path.exists(pano_dir):
os.mkdir(pano_dir)
for Panoid in Panoids:
meta_info = gsv_df[gsv_df['Panoid'] == Panoid]
save_dir = f"{pano_dir}/{Panoid}_{meta_info['Lat'].values[0]}_{meta_info['Lon'].values[0]}_{meta_info['Heading'].values[0]}_{meta_info['Year'].values[0]}_{meta_info['Month'].values[0]}.jpg"
print(save_dir)
try:
image = get_panorama(Panoid)
    except Exception:
        print(str(Panoid) + " failed to download, skipping...")
continue
image.save(save_dir, "jpeg")
print('Saved Successfully!')
# Additional imports required by the tile-based panorama downloader below
import itertools
import time
import requests
from io import BytesIO
from PIL import Image

def tiles_info(panoid, width, length, zoom=5):
"""
Generate a list of a panorama's tiles and their position.
The format is (x, y, filename, fileurl)
"""
    image_url = "https://cbks2.google.com/cbk?cb_client=maps_sv.tactile&authuser=0&output=tile&hl=en&panoid={}&zoom={}&x={}&y={}"
    # The tile positions
coord = list(itertools.product(range(length), range(width)))
tiles = [(x, y, "%s_%dx%d.jpg" % (panoid, x, y), image_url.format(panoid, zoom, x, y)) for x, y in coord]
return tiles
def download_panorama_v3(panoid, width, length, zoom=5, panorama_path=None, disp=False):
    '''
    Download a panorama's tiles and stitch them into a single image.
    input:
        panoid: the id of a panorama on Google Maps
        width, length: number of tiles to request along the horizontal and vertical axes
        zoom: larger number -> higher resolution, from 1 to 5; zoom levels above 3 may fail for some locations
        disp: print download progress (usually not needed)
    output:
        panorama image (uncropped)
    '''
tile_width = 512
tile_height = 512
    img_w = int(np.ceil(416 * (2 ** zoom) / tile_width) * tile_width)
    img_h = int(np.ceil(416 * (2 ** (zoom - 1)) / tile_width) * tile_width)
    tiles = tiles_info(panoid, length, width, zoom=zoom)
valid_tiles = []
    # Download each valid tile
for i, tile in enumerate(tiles):
x, y, fname, url = tile
if disp and i % 20 == 0:
print("Image %d / %d" % (i, len(tiles)))
if x*tile_width < img_w and y*tile_height < img_h: # tile is valid
# Try to download the image file
while True:
try:
response = requests.get(url, stream=True)
break
except requests.ConnectionError:
print("Connection error. Trying again in 2 seconds.")
time.sleep(2)
valid_tiles.append( Image.open(BytesIO(response.content)) )
del response
    # Stitch the downloaded tiles into a single panorama image
panorama = Image.new('RGB', (img_w, img_h))
i = 0
for x, y, fname, url in tiles:
if x*tile_width < img_w and y*tile_height < img_h: # tile is valid
tile = valid_tiles[i]
i+=1
panorama.paste(im=tile, box=(x*tile_width, y*tile_height))
    # Save the stitched panorama (a PIL Image) to disk if a path is provided
    if panorama_path:
        panorama.save(panorama_path)
    return np.array(panorama)
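download_panorama_v3 is defined above but never called in this notebook; the sketch below shows one possible invocation (the tile counts and zoom level are illustrative assumptions derived from the formulas inside the function, not values from the original run).
# Hypothetical usage of the tile-based downloader defined above; at zoom=3 the stitched image
# spans 7 x 4 tiles of 512 px, so width=7 and length=4 cover the full panorama.
for Panoid in Panoids[:1]:
    tile_pano_path = os.path.normpath(f"{pano_dir}/{Panoid}_tiled.jpg")
    download_panorama_v3(Panoid, width=7, length=4, zoom=3, panorama_path=tile_pano_path, disp=True)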