Q1. Write a brief description of the project you would like to pursue on CyberGISX. Specify the data sources you would be using for the project. Describe the analysis/modeling/simulation scenarios you hope to achieve this semester using CyberGISX.
I am interested in identifying which states have underserved pediatric populations relative to others. The following links contain the pediatric census data and the number of general pediatricians employed within their respective states. Pediatric data: https://data.census.gov/table?q=Children%20by%20state Pediatrician data: https://data.bls.gov/oes/#/occGeo/One%20occupation%20for%20multiple%20geographical%20areas
I will use skills learned in previous classes to merge the datasets and calculate statistics, and I will then use folium to generate interactive state-level maps.
Q2. Using techniques we have learned in Module 3, please develop a visualization of any of the components of the datasets you have chosen.
# Import the pandas library for data manipulation
import pandas as pd
# Import the numpy library for numerical operations (not used in this specific code block)
import numpy as np
# Import the folium library for creating web maps
import folium
# --- Input locations ---------------------------------------------------------
# Shared course directory that holds the spatial data files.
data_path = "/home/jovyan/shared_data/data/geog407/assignment3/"
# GeoJSON file containing US state boundaries.
state_geo = f"{data_path}/us-states.json"

# --- Load the raw tables -----------------------------------------------------
# Pediatric (census) table; it is reshaped into one value per state further on.
population_data = pd.read_csv('Pediatric_data.csv')
# Pediatrician employment report:
# - skiprows=5 skips the first 5 lines of the file (presumably report preamble)
# - usecols=[0, 1] keeps only the first two columns
doctor_data = pd.read_csv('Physician_Report.csv', skiprows=5, usecols=[0, 1])
# Show the report exactly as loaded, before any rows are trimmed.
print(doctor_data)

# --- Tidy the doctor table ---------------------------------------------------
# Drop the last six rows (presumably footnotes -- confirm against the file).
# .copy() makes this an independent frame so the column assignments below
# modify it directly instead of a slice of the original (avoids
# SettingWithCopyWarning).
doctor_data = doctor_data.head(len(doctor_data) - 6).copy()
# The state name is the text before the first "(" in 'Area Name'. Surrounding
# whitespace is stripped so the names can match the census table exactly when
# the two tables are merged on state.
doctor_data['state'] = (
    doctor_data['Area Name'].str.split("(").str.get(0).str.strip()
)
# Show the column names now that 'state' has been added.
print(doctor_data.columns)
# Expose 'Employment(1)' under a clearer name.
doctor_data['number of doctors'] = doctor_data['Employment(1)']
# Keep only the two columns needed downstream.
doctor_data = doctor_data[['state', 'number of doctors']]
# Show the table dimensions (rows, columns).
print(doctor_data.shape)
# Alphabetical list of the distinct state names in the doctor report.
doctor_state_names = sorted(set(doctor_data['state']))
# Show how many distinct states the doctor report covers.
print(len(doctor_state_names))
# Column labels (excluding the first column) as plain strings.
column_names_string = [str(column) for column in population_data.columns[1:]]
# The state name is the part of each label before the first "!!".
ped_data_state_names = [label.split("!!")[0] for label in column_names_string]
# Distinct state names, sorted alphabetically.
unique_state_names = sorted(set(ped_data_state_names))
num_unique_state_names = len(unique_state_names)
# Number of data columns (excluding the first column).
num_columns = len(column_names_string)
# Sanity check: how many columns the table carries per state on average.
average_columns_per_state = num_columns / num_unique_state_names
print(average_columns_per_state)

# Column positions to keep: position 1 and every 12th position after it.
keep_list = list(range(1, len(population_data.columns), 12))
wanted_columns = sorted(population_data.columns[i] for i in keep_list)

# Row 23 is the under-18 population of each state/territory (per the original
# author's note -- confirm if the source table layout changes).
scrubbed_population_ds = population_data[wanted_columns].iloc[23, :]
scrubbed_population_ds = scrubbed_population_ds.rename("pediatric population estimate")
# Re-index by bare state name (text before the first "!!").
scrubbed_population_ds.index = scrubbed_population_ds.index.str.split("!!").str.get(0)
# Name the column explicitly: to_frame(name=None) means different things
# across pandas versions, so the result should not depend on it.
scrubbed_population_df = scrubbed_population_ds.to_frame(
    name="pediatric population estimate"
)

# States present in the census table but absent from the doctor report.
known_doctor_states = set(doctor_state_names)
missing_states = [
    state for state in unique_state_names if state not in known_doctor_states
]
missing_states
# Attach the doctor counts to every state in the census table. A left join
# keeps states that have no entry in the doctor report; their doctor count is
# left missing.
joined_data = pd.merge(
    scrubbed_population_df,
    doctor_data,
    left_index=True,
    right_on='state',
    how='left',
)
# Replace any missing state label with 'Unknown'. The result is assigned back
# to the column because calling fillna(inplace=True) on a column selection is
# chained assignment and does not reliably update the parent frame.
joined_data['state'] = joined_data['state'].fillna('Unknown')
# Discard the index produced by the merge in favour of a clean 0..n-1 index.
joined_data = joined_data.reset_index(drop=True)
joined_data = joined_data[['state', 'pediatric population estimate', 'number of doctors']]
print(joined_data)
# --- Doctor counts -> numbers ------------------------------------------------
num_docs_list = joined_data['number of doctors'].to_list()
# Strip thousands separators before converting, the same way the patient
# estimates are handled below, so a value such as "1,234" is not coerced to
# NaN. Anything that still is not a number (missing values, footnote markers)
# becomes NaN.
new_num_docs_list = pd.to_numeric(
    pd.Series(num_docs_list, dtype=object)
    .astype(str)
    .str.replace(",", "", regex=False),
    errors='coerce',
).to_numpy()
new_num_docs_list

# --- Patient estimates -> integers -------------------------------------------
num_patients_list = joined_data['pediatric population estimate'].to_list()
num_patients_list
# The estimates are strings with thousands separators; drop the commas and
# convert each one to int.
clean_num_pat_list = [
    int(estimate.replace(",", "")) for estimate in num_patients_list
]
clean_num_pat_list

# --- Assemble the cleaned table and derive the ratios ------------------------
state_names = joined_data['state']
cleaned_data = pd.DataFrame({
    "State": state_names,
    "Number of Doctors": new_num_docs_list,
    "Patient Estimate": clean_num_pat_list,
})
# Doctors per pediatric patient and its inverse; a state whose doctor count is
# missing gets NaN for both.
cleaned_data['Doc to Patient Ratio'] = cleaned_data['Number of Doctors'] / cleaned_data['Patient Estimate']
cleaned_data['Patient to doc Ratio'] = cleaned_data['Patient Estimate'] / cleaned_data['Number of Doctors']
cleaned_data
# Directory that holds the spatial data used for the map.
# NOTE(review): this differs from the "assignment3" directory set earlier in
# the file -- confirm which location is intended.
data_path = "/home/jovyan/shared_data/data/geog407/lab3/"
# GeoJSON with the US state boundaries (the choropleth geometry).
state_geo = f"{data_path}/us-states.json"

# Base map: central latitude/longitude and the starting zoom level.
m = folium.Map(location=[48, -102], zoom_start=3)

# Choropleth settings:
# - state_geo supplies the geographic data
# - cleaned_data supplies the tabular data
# - the "State" column is matched against each feature's properties.name,
#   and "Patient to doc Ratio" drives the fill colour
choropleth_options = {
    "geo_data": state_geo,
    "name": "Patient to Doctor Ratio",
    "data": cleaned_data,
    "columns": ["State", "Patient to doc Ratio"],
    "key_on": "feature.properties.name",
    "fill_color": "YlGnBu",
    "fill_opacity": 0.7,
    "line_opacity": 0.2,
    "legend_name": "Patient to Doctor Ratio",
}
choropleth_layer = folium.Choropleth(**choropleth_options)
choropleth_layer.add_to(m)

# Add the layer control to the map.
layer_control = folium.LayerControl()
layer_control.add_to(m)

# Finally show the map.
m