import json
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from scipy import stats
from io import StringIO, BytesIO
from bs4 import BeautifulSoup
from pprint import pprint
from scipy.stats import linregress
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
import warnings

warnings.filterwarnings("ignore")

# Fetching the cargo fraud data that has been uploaded to our GitHub Repository
cargo_fraud = pd.read_csv("https://raw.githubusercontent.com/ryantangmj/ryantangmj.github.io/refs/heads/main/cargo_fraud_only.csv", index_col=0)
cargo_fraud = cargo_fraud.reset_index()
cargo_fraud.head()

# Filtering out fraud data only
cargo_fraud = cargo_fraud[cargo_fraud["offense_name"].isin(["Credit Card/Automated Teller Machine Fraud", 
                                              "Wire Fraud", 
                                              "Welfare Fraud"])]
cargo_fraud.head()

# Calculate the proportion of fraud by region
fraud_by_state = cargo_fraud['region_name'].value_counts(normalize=True).sort_values(ascending=False)

# Proportion of Fraud by Region with Enhanced Visualization
plt.figure(figsize=(14, 8))
sns.barplot(x=fraud_by_state.values, y=fraud_by_state.index, palette="coolwarm", edgecolor="black")
plt.title("Proportion of Fraud Occurrences by Region", fontsize=20, weight="bold", pad=20)
plt.xlabel("Proportion of Fraud Occurrences", fontsize=14, weight="bold")
plt.ylabel("Region", fontsize=14, weight="bold")
plt.grid(axis='x', linestyle='--', linewidth=0.5, alpha=0.7)
plt.tight_layout()
plt.show()

# Calculate the proportion of fraud by state
fraud_by_state = cargo_fraud['state_name'].value_counts(normalize=True).sort_values(ascending=False)

# Retrieving geopandas usa data
usa = gpd.read_file('https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_state_500k.zip')

# Merge the geopandas file and fraud data
merged = usa.merge(fraud_by_state, how='left', left_on='NAME', right_on='state_name')
merged['proportion'] = merged['proportion'].fillna(0)

# Exclude Alaska and Hawaii before reprojection
merged = merged[~merged['STUSPS'].isin(['AK', 'HI'])]

# Reproject the GeoDataFrame
visframe = merged.to_crs(epsg=2163)

# Enhanced Map Visualization for Fraud by State
fig, ax = plt.subplots(1, 1, figsize=(18, 12))
visframe.plot(
    column='proportion', cmap='coolwarm', linewidth=0.6,
    ax=ax, edgecolor='black', legend=True,
    legend_kwds={
        'label': "Proportion of Fraud Occurrences by State",
        'orientation': "horizontal",
        'shrink': 0.8
    }
)

for idx, row in visframe.iterrows():
    ax.annotate(text=f"{row['STUSPS']}\n{row['proportion']:.3f}", xy=row['geometry'].centroid.coords[0],
                horizontalalignment='center', fontsize=6, color='black')
    
ax.set_xlim([-2.5e6, 2.5e6])
ax.set_ylim([-2.5e6, 1.5e6])
ax.set_axis_off()
ax.set_title('Proportion of Fraud Occurrences by State', fontsize=20, weight="bold", pad=20)
plt.tight_layout()
plt.show()

# Ensure data_cargo_fraud is your dataset and 'offender_race' is the correct column name
fraud_by_race = cargo_fraud['offender_race'].value_counts(normalize=True).sort_values(ascending=False)

# Enhanced Bar Chart for Fraud Occurrences by Race
plt.figure(figsize=(14, 8))
sns.barplot(
    x=fraud_by_race.values, 
    y=fraud_by_race.index, 
    palette="viridis", edgecolor="black"
)

plt.title("Proportion of Fraud Occurrences by Race", fontsize=20, weight="bold", pad=20, color="#333333")
plt.xlabel("Proportion of Fraud Occurrences", fontsize=16, weight="bold", labelpad=10, color="#555555")
plt.ylabel("Race", fontsize=16, weight="bold", labelpad=10, color="#555555")
plt.xticks(fontsize=12, color="#333333")
plt.yticks(fontsize=12, color="#333333")
plt.grid(axis='x', color='gray', linestyle='--', linewidth=0.5, alpha=0.7)
plt.tight_layout()
plt.show()

# Count the occurrences of fraud by victim type
fraud_by_type = cargo_fraud['victim_type_name'].value_counts(normalize=True).sort_values(ascending=False)

# Enhanced Bar Plot for Fraudulent Transactions by Victim Type
plt.figure(figsize=(14, 8))
sns.barplot(
    x=fraud_by_type.values, 
    y=fraud_by_type.index, 
    palette="Spectral", edgecolor="black"
)

plt.title("Proportion of Fraudulent Transactions by Victim Type", fontsize=20, weight="bold", pad=20, color="#333333")
plt.xlabel("Proportion of Fraudulent Transactions", fontsize=16, weight="bold", labelpad=10, color="#555555")
plt.ylabel("Victim Type", fontsize=16, weight="bold", labelpad=10, color="#555555")
plt.xticks(fontsize=12, color="#333333")
plt.yticks(fontsize=12, color="#333333")
plt.grid(axis='x', color='gray', linestyle='--', linewidth=0.5, alpha=0.7)
plt.tight_layout()
plt.show()

# Count the occurrences of fraud by year
fraud_by_year = cargo_fraud['data_year'].value_counts().sort_index()

# Enhanced Line Plot for Fraudulent Transactions Over Time
plt.figure(figsize=(14, 8))
sns.set_theme(style="whitegrid")  # Clean theme
sns.lineplot(
    x=fraud_by_year.index, 
    y=fraud_by_year.values, 
    marker="o", color="#5A9BD5", linewidth=2.5
)

plt.title("Fraudulent Transactions by Year", fontsize=20, weight="bold", pad=20, color="#333333")
plt.xlabel("Year", fontsize=16, weight="bold", labelpad=10, color="#555555")
plt.ylabel("Number of Fraudulent Transactions", fontsize=16, weight="bold", labelpad=10, color="#555555")
plt.xticks(fontsize=12, color="#333333")
plt.yticks(fontsize=12, color="#333333")
plt.grid(color='gray', linestyle='--', linewidth=0.5, alpha=0.7)
plt.tight_layout()
plt.show()

# Mapping of state abbreviations to full state names in lowercase
state_abbrev_to_name = {
    'al': 'alabama', 'ak': 'alaska', 'az': 'arizona', 'ar': 'arkansas',
    'ca': 'california', 'co': 'colorado', 'ct': 'connecticut', 'de': 'delaware',
    'fl': 'florida', 'ga': 'georgia', 'hi': 'hawaii', 'id': 'idaho',
    'il': 'illinois', 'in': 'indiana', 'ia': 'iowa', 'ks': 'kansas',
    'ky': 'kentucky', 'la': 'louisiana', 'me': 'maine', 'md': 'maryland',
    'ma': 'massachusetts', 'mi': 'michigan', 'mn': 'minnesota', 'ms': 'mississippi',
    'mo': 'missouri', 'mt': 'montana', 'ne': 'nebraska', 'nv': 'nevada',
    'nh': 'new hampshire', 'nj': 'new jersey', 'nm': 'new mexico', 'ny': 'new york',
    'nc': 'north carolina', 'nd': 'north dakota', 'oh': 'ohio', 'ok': 'oklahoma',
    'or': 'oregon', 'pa': 'pennsylvania', 'ri': 'rhode island', 'sc': 'south carolina',
    'sd': 'south dakota', 'tn': 'tennessee', 'tx': 'texas', 'ut': 'utah',
    'vt': 'vermont', 'va': 'virginia', 'wa': 'washington', 'wv': 'west virginia',
    'wi': 'wisconsin', 'wy': 'wyoming', 'dc': 'district of columbia',
    'as': 'american samoa', 'gu': 'guam', 'mp': 'northern mariana islands',
    'pr': 'puerto rico', 'vi': 'virgin islands', 'us': 'united states'
}

# Define territories and non-continental states to exclude
exclude_states = [
    'puerto rico', 'guam', 'virgin islands', 'american samoa',
    'northern mariana islands', 'alaska', 'hawaii', 'united states'
]

exclude_states_lower = [state.lower() for state in exclude_states]

# Load the cargo fraud data
cargo_fraud = pd.read_csv('cargo_fraud_only.csv')

# Check if 'data_year' column exists
if 'data_year' in cargo_fraud.columns:
    # Use 'data_year' as 'Year'
    cargo_fraud['Year'] = cargo_fraud['data_year'].astype(int)
else:
    # Extract 'Year' from 'date_recovered' or another date column
    cargo_fraud['date_recovered'] = pd.to_datetime(cargo_fraud['date_recovered'], errors='coerce')
    cargo_fraud['Year'] = cargo_fraud['date_recovered'].dt.year
    cargo_fraud = cargo_fraud.dropna(subset=['Year'])
    cargo_fraud['Year'] = cargo_fraud['Year'].astype(int)

# Ensure the 'State' column exists and matches the 'State' in other datasets
if 'state_name' in cargo_fraud.columns:
    cargo_fraud['State'] = cargo_fraud['state_name'].str.lower().str.strip()
elif 'state_abbr' in cargo_fraud.columns:
    # Map state abbreviations to full state names
    cargo_fraud['State'] = cargo_fraud['state_abbr'].str.lower().map(state_abbrev_to_name)
else:
    raise KeyError("No 'state_name' or 'state_abbr' column found in cargo_fraud DataFrame.")

# Exclude rows with missing 'State' or 'Year'
cargo_fraud = cargo_fraud.dropna(subset=['State', 'Year'])

# Exclude territories and non-continental states
cargo_fraud = cargo_fraud[~cargo_fraud['State'].isin(exclude_states_lower)]

# Display the processed cargo fraud data
print("\nProcessed Cargo Fraud Data:")
display(cargo_fraud[['State', 'Year']])

print(cargo_fraud.columns)

Processed Cargo Fraud Data:

Index(['data_year', 'ori', 'pub_agency_name', 'pub_agency_unit',
       'agency_type_name', 'state_abbr', 'state_name', 'division_name',
       'county_name', 'region_name', 'population_group_code',
       'population_group_desc', 'offense_code', 'offense_name',
       'offender_race', 'offender_ethnicity', 'offender_age', 'offender_sex',
       'victim_type_code', 'victim_type_name', 'location_code',
       'location_name', 'weapon_code', 'weapon_name', 'prop_desc_code',
       'prop_desc_code.1', 'prop_desc_name', 'stolen_value', 'recovered_value',
       'recovered_flag', 'date_recovered', 'Year', 'State'],
      dtype='object')

# Load HPI data
hpi_data_url = "https://raw.githubusercontent.com/ryantangmj/ryantangmj.github.io/main/hpi_by_state.csv"
hpi_data = pd.read_csv(hpi_data_url)

# Keep relevant columns
hpi_data = hpi_data[["State", "Year", "HPI"]]

# Filter years between 2012 and 2022
hpi_data = hpi_data[(hpi_data["Year"] >= 2012) & (hpi_data["Year"] <= 2022)].reset_index(drop=True)

# Standardize 'State' names
hpi_data['State'] = hpi_data['State'].str.lower().str.strip()

# Map state abbreviations to full names if necessary
if hpi_data['State'].str.len().max() == 2:
    hpi_data['State'] = hpi_data['State'].map(state_abbrev_to_name)

# Exclude territories and non-continental states
hpi_data = hpi_data[~hpi_data['State'].isin(exclude_states_lower)]

# Convert 'Year' to integer
hpi_data['Year'] = hpi_data['Year'].astype(int)

# Display the first few rows
print("\nProcessed HPI Data:")
display(hpi_data)

Processed HPI Data:

# Group the HPI data by year
hpi_by_year = hpi_data.groupby("Year").agg(HPI=('HPI', 'mean')
).reset_index()

# Plot HPI over time
plt.figure(figsize=(10, 6))
plt.plot(hpi_by_year.Year, hpi_by_year.HPI, marker='o', linestyle='-')
plt.title('Housing Price Index by Year')
plt.xlabel('Year')
plt.ylabel('Housing Price Index')
plt.grid(True)
plt.tight_layout()
plt.show()

hpi_by_state = hpi_data.groupby("State").agg(HPI=('HPI', 'median')
).reset_index()

# retrieving geopandas usa data
usa = gpd.read_file('https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_state_500k.zip')
usa['NAME_lower'] = usa['NAME'].str.lower()

# merge the geopandas file and unemployment data
merged = usa.merge(hpi_by_state, how='left', left_on='NAME_lower', right_on='State')

# exclude Alaska and Hawaii before reprojection
merged = merged[~merged['STUSPS'].isin(['AK', 'HI'])]

# reproject the GeoDataFrame
visframe = merged.to_crs(epsg=2163)

# Enhanced HPI Map
fig, ax = plt.subplots(1, 1, figsize=(20, 12))
visframe.plot(column='HPI', cmap='coolwarm', linewidth=0.8, ax=ax, edgecolor='0.5',
              legend=True, legend_kwds={'label': "Median HPI (2012-2022)", 'orientation': "horizontal"})
for idx, row in visframe.iterrows():
    ax.annotate(text=f"{row['STUSPS']}\n{row['HPI']:.1f}", xy=row['geometry'].centroid.coords[0],
                horizontalalignment='center', fontsize=8, color='black')
ax.set_title('Median Housing Price Index by State (2012-2022)', fontsize=20, pad=20, weight='bold')
ax.set_axis_off()
plt.tight_layout()
plt.show()

# Load poverty data
poverty_data = pd.read_csv('poverty_data.csv')  # Replace with the actual file path

# Reshape poverty data to long format
id_vars = ['State']
value_vars = [col for col in poverty_data.columns if col != 'State']

poverty_long = pd.melt(poverty_data, id_vars=id_vars, value_vars=value_vars,
                       var_name='Variable', value_name='Value')

# Extract 'Year' from the 'Variable' column
poverty_long['Year'] = poverty_long['Variable'].str.extract('(\d{4})', expand=False).astype(int)
poverty_long['Variable_Name'] = poverty_long['Variable'].str.replace(' \d{4}', '', regex=True).str.strip()

# Pivot the data to have one row per 'State' and 'Year'
poverty_pivot = poverty_long.pivot_table(index=['State', 'Year'], columns='Variable_Name', values='Value', aggfunc='first').reset_index()
poverty_pivot.columns.name = None

# Standardize 'State' names
poverty_pivot['State'] = poverty_pivot['State'].str.lower().str.strip()

# Convert numeric columns to float
numeric_cols = ['Total population', 'Number in poverty', 'Percentage poverty']
for col in numeric_cols:
    poverty_pivot[col] = poverty_pivot[col].astype(str).replace('nan', '').str.replace(',', '').str.replace('%', '')
    poverty_pivot[col] = poverty_pivot[col].replace('', pd.NA)
    poverty_pivot[col] = poverty_pivot[col].astype(float)

# Rename columns for clarity
poverty_pivot.rename(columns={
    'Total population': 'Total_Population',
    'Number in poverty': 'Number_in_Poverty',
    'Percentage poverty': 'Poverty_Rate'
}, inplace=True)

# Exclude territories and non-continental states
poverty_pivot = poverty_pivot[~poverty_pivot['State'].isin(exclude_states_lower)]

# Display the first few rows
print("\nProcessed Poverty Data:")
display(poverty_pivot)

Processed Poverty Data:

df_poverty_by_state = poverty_pivot.groupby("State").agg(percentage_poverty=('Poverty_Rate', 'median')
).reset_index()

# Retrieving geopandas usa data
usa = gpd.read_file('https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_state_500k.zip')
usa['NAME_lower'] = usa['NAME'].str.lower()

# Merge the geopandas file and unemployment data
merged = usa.merge(df_poverty_by_state, how='left', left_on='NAME_lower', right_on='State')

# Exclude Alaska and Hawaii before reprojection
merged = merged[~merged['STUSPS'].isin(['AK', 'HI', 'GU', 'MP', 'PR', 'VI'])]

# Reproject the GeoDataFrame
visframe = merged.to_crs(epsg=2163)

# Enhanced Poverty Map
fig, ax = plt.subplots(1, 1, figsize=(20, 12))
visframe.plot(column='percentage_poverty', cmap='coolwarm', linewidth=0.8, ax=ax, edgecolor='0.5',
              legend=True, legend_kwds={'label': "Median Percentage Poverty (2007-2022)", 'orientation': "horizontal"})
for idx, row in visframe.iterrows():
    ax.annotate(text=f"{row['STUSPS']}\n{row['percentage_poverty']:.1f}", xy=row['geometry'].centroid.coords[0],
                horizontalalignment='center', fontsize=8, color='black')
ax.set_title('Median Percentage Poverty by State (2007-2022)', fontsize=20, pad=20, weight='bold')
ax.set_axis_off()
plt.tight_layout()
plt.show()

# Load homelessness data
homelessness_data_url = "https://raw.githubusercontent.com/ryantangmj/ryantangmj.github.io/main/homeless_data.csv"
homelessness_data = pd.read_csv(homelessness_data_url)

# Map state abbreviations to full state names
homelessness_data['State'] = homelessness_data['State'].str.lower().map(state_abbrev_to_name)

# Exclude rows with missing 'State'
homelessness_data = homelessness_data.dropna(subset=['State'])

# Exclude territories and non-continental states
homelessness_data = homelessness_data[~homelessness_data['State'].isin(exclude_states_lower)]

# Rename columns for years (Assuming columns have year information)
new_column_names = {
    col: col.split(', ')[-1].split('-')[0] for col in homelessness_data.columns if 'Change in Total Homelessness' in col
}
homelessness_data.rename(columns=new_column_names, inplace=True)

# Melt the DataFrame to long format
df_homelessness = pd.melt(homelessness_data, id_vars=['State'], var_name='Year', value_name='homeless_rate_change')

# Clean 'homeless_rate_change' column
df_homelessness['homeless_rate_change'] = df_homelessness['homeless_rate_change'].replace(' ', np.nan)
df_homelessness = df_homelessness.dropna(subset=['homeless_rate_change'])
df_homelessness['homeless_rate_change'] = df_homelessness['homeless_rate_change'].str.replace('%', '').astype(float)
df_homelessness['Year'] = df_homelessness['Year'].astype(int)

# Display the first few rows
print("\nProcessed Homelessness Data:")
display(df_homelessness)

Processed Homelessness Data:

df_homelessness_by_state = df_homelessness.groupby("State").agg(homeless_rate_change=('homeless_rate_change', 'median')
).reset_index()

# retrieving geopandas usa data
usa = gpd.read_file('https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_state_500k.zip')
usa['NAME_lower'] = usa['NAME'].str.lower()

# merge the geopandas file and unemployment data
merged = usa.merge(df_homelessness_by_state, how='left', left_on='NAME_lower', right_on='State')

# exclude Alaska and Hawaii before reprojection
merged = merged[~merged['STUSPS'].isin(['AK', 'HI', 'GU', 'MP', 'PR', 'VI'])]

# reproject the GeoDataFrame
visframe = merged.to_crs(epsg=2163)

# Enhanced HPI Map
fig, ax = plt.subplots(1, 1, figsize=(20, 12))
visframe.plot(column='homeless_rate_change', cmap='coolwarm', linewidth=0.8, ax=ax, edgecolor='0.5',
              legend=True, legend_kwds={'label': "Median Homeless Rate Change (2007-2022)", 'orientation': "horizontal"})
for idx, row in visframe.iterrows():
    ax.annotate(text=f"{row['STUSPS']}\n{row['homeless_rate_change']:.1f}", xy=row['geometry'].centroid.coords[0],
                horizontalalignment='center', fontsize=8, color='black')
ax.set_title('Median Homeless Rate Change by State (2007-2022)', fontsize=20, pad=20, weight='bold')
ax.set_axis_off()
plt.tight_layout()
plt.show()

# Load education data
education_data = pd.read_csv('education.csv')  # Replace with the actual file path

# Display the first few rows to verify the structure
print("\nInitial Education Data:")
display(education_data)

# Identify columns related to education metrics for years 2012-2022
education_cols = [col for col in education_data.columns if any(str(year) in col for year in range(2012, 2023))]

# Keep 'State' and the identified education columns
education_data = education_data[['State'] + education_cols]

# Melt the data to long format
education_long = pd.melt(
    education_data,
    id_vars=['State'],
    value_vars=education_cols,
    var_name='Variable',
    value_name='Value'
)

# Extract 'Year' and 'Education_Variable' from the 'Variable' column
education_long['Year'] = education_long['Variable'].str.extract('(\d{4})', expand=False).astype(int)
education_long['Education_Variable'] = education_long['Variable'].str.replace(' \d{4}', '', regex=True).str.strip()

# Drop rows where 'Year' is NaN
education_long = education_long.dropna(subset=['Year'])

# Pivot the data to have one row per 'State' and 'Year'
education_pivot = education_long.pivot_table(
    index=['State', 'Year'],
    columns='Education_Variable',
    values='Value',
    aggfunc='first'
).reset_index()

# Flatten the columns
education_pivot.columns.name = None

# Standardize 'State' names
education_pivot['State'] = education_pivot['State'].str.lower().str.strip()

# Map state abbreviations to full state names if necessary
if education_pivot['State'].str.len().max() == 2:
    education_pivot['State'] = education_pivot['State'].map(state_abbrev_to_name)

# Exclude territories and non-continental states
education_pivot = education_pivot[~education_pivot['State'].isin(exclude_states_lower)]

# Convert numeric columns to float
numeric_cols = [col for col in education_pivot.columns if col not in ['State', 'Year']]
for col in numeric_cols:
    # Convert to string and remove commas and percent signs
    education_pivot[col] = education_pivot[col].astype(str).str.replace(',', '').str.replace('%', '')
    # Convert to numeric, coercing errors to NaN
    education_pivot[col] = pd.to_numeric(education_pivot[col], errors='coerce')

# Rename columns for clarity (Adjust based on actual column names)
# Example:
# education_pivot.rename(columns={
#     "Percentage with Bachelor's Degree": 'Bachelor_Degree_Rate'
# }, inplace=True)

# Exclude rows with missing 'State' or 'Year' after mapping
education_pivot = education_pivot.dropna(subset=['State', 'Year'])

# Display the first few rows of the processed education data
print("\nProcessed Education Data (2012-2022):")
display(education_pivot)

Initial Education Data:

Processed Education Data (2012-2022):

# Load the unemployment data
unemployment_data = pd.read_csv('Unemployment.csv')  # Replace with the actual file path

# Identify columns that contain 'Unemployment_rate'
unemployment_rate_cols = [col for col in unemployment_data.columns if 'Unemployment_rate' in col]

# Melt the DataFrame to long format
unemployment_long = pd.melt(
    unemployment_data,
    id_vars=['State', 'Area_Name'],
    value_vars=unemployment_rate_cols,
    var_name='Year',
    value_name='Unemployment_Rate'
)

# Extract the year from the 'Year' column
unemployment_long['Year'] = unemployment_long['Year'].str.extract('Unemployment_rate_(\d+)', expand=False).astype(int)

# Convert 'Unemployment_Rate' to numeric
unemployment_long['Unemployment_Rate'] = pd.to_numeric(unemployment_long['Unemployment_Rate'], errors='coerce')

# Standardize 'State' names
unemployment_long['State'] = unemployment_long['State'].str.lower().str.strip()

# Map state abbreviations to full state names if necessary
if unemployment_long['State'].str.len().max() == 2:
    unemployment_long['State'] = unemployment_long['State'].map(state_abbrev_to_name)

# Exclude territories and non-continental states
unemployment_long = unemployment_long[~unemployment_long['State'].isin(exclude_states_lower)]

# Aggregate at the state level by taking the mean unemployment rate for each state and year
state_unemployment = unemployment_long.groupby(['State', 'Year'])['Unemployment_Rate'].mean().reset_index()

# Display the processed unemployment data
print("\nProcessed Unemployment Data:")
print(state_unemployment.head())

Processed Unemployment Data:
     State  Year  Unemployment_Rate
0  alabama  2000           5.557353
1  alabama  2001           6.314706
2  alabama  2002           7.030882
3  alabama  2003           7.017647
4  alabama  2004           6.536765

     State  Year  Unemployment_Rate
0  alabama  2000           5.557353
1  alabama  2001           6.314706
2  alabama  2002           7.030882
3  alabama  2003           7.017647
4  alabama  2004           6.536765

# Aggregate the number of fraud cases per state and year
fraud_counts = cargo_fraud.groupby(['State', 'Year']).size().reset_index(name='Fraud_Count')

# Ensure 'State' and 'Year' are of appropriate types
fraud_counts['State'] = fraud_counts['State'].str.lower().str.strip()
fraud_counts['Year'] = fraud_counts['Year'].astype(int)

# Define a function to prepare dataframes for merging
def prepare_dataframe(df, state_col='State', year_col='Year'):
    df[state_col] = df[state_col].str.lower().str.strip()
    df[year_col] = df[year_col].astype(int)
    return df

# Prepare all datasets
fraud_counts = prepare_dataframe(fraud_counts)
poverty_pivot = prepare_dataframe(poverty_pivot)
education_pivot = prepare_dataframe(education_pivot)
hpi_data = prepare_dataframe(hpi_data)
state_unemployment = prepare_dataframe(state_unemployment)

# Merge fraud_counts with poverty_pivot
merged_data = pd.merge(fraud_counts, poverty_pivot, on=['State', 'Year'], how='left')
print("\nAfter merging with poverty_pivot:", merged_data.shape)
display(merged_data)

# Merge with education_pivot
merged_data = pd.merge(merged_data, education_pivot, on=['State', 'Year'], how='left')
print("After merging with education_pivot:", merged_data.shape)
display(merged_data)

# Merge with hpi_data
merged_data = pd.merge(merged_data, hpi_data, on=['State', 'Year'], how='left')
print("After merging with hpi_data:", merged_data.shape)
display(merged_data)

# Merge with state_unemployment
merged_data = pd.merge(merged_data, state_unemployment, on=['State', 'Year'], how='left')
print("After merging with state_unemployment:", merged_data.shape)
display(merged_data)

After merging with poverty_pivot: (338, 6)

After merging with education_pivot: (338, 16)

After merging with hpi_data: (338, 17)

After merging with state_unemployment: (338, 18)

# Display the count of missing values in each column
print("\nMissing values in merged_data:")
print(merged_data.isnull().sum())

# Check if merged_data is empty
if merged_data.empty:
    print("Error: 'merged_data' is empty. Please check the merging steps.")
else:
    # Define critical columns for analysis
    critical_columns = ['Fraud_Count', 'Poverty_Rate', 'HPI', 'Unemployment_Rate']

    # Initialize imputers
    mean_imputer = SimpleImputer(strategy='mean')
    median_imputer = SimpleImputer(strategy='median')

    # Impute 'Poverty_Rate' and 'HPI' with mean
    for col in ['Poverty_Rate', 'HPI']:
        if col in merged_data.columns:
            merged_data[col] = mean_imputer.fit_transform(merged_data[[col]])
            print(f"Imputed missing values in '{col}' with mean.")
        else:
            print(f"Warning: '{col}' column not found in merged_data.")

    # Impute 'Unemployment_Rate' with median
    col = 'Unemployment_Rate'
    if col in merged_data.columns:
        merged_data[col] = median_imputer.fit_transform(merged_data[[col]])
        print(f"Imputed missing values in '{col}' with median.")
    else:
        print(f"Warning: '{col}' column not found in merged_data.")

    # Identify columns with more than 50% missing values
    threshold = 0.5
    missing_percent = merged_data.isnull().mean()
    columns_to_drop = missing_percent[missing_percent > threshold].index.tolist()

    print("\nColumns to drop due to high missingness (>50% missing):")
    print(columns_to_drop)

    # Drop these columns
    merged_data_clean = merged_data.drop(columns=columns_to_drop)
    print("\nDropped columns with high missingness.")

    # Verify the number of rows after dropping
    print(f"\nNumber of rows before dropping high missingness columns: {merged_data.shape[0]}")
    print(f"Number of rows after dropping high missingness columns: {merged_data_clean.shape[0]}")

    # Final check for missing values
    print("\nMissing values in merged_data_clean after handling:")
    print(merged_data_clean.isnull().sum())

    # Optional: Impute remaining low missingness columns if necessary
    # In your case, 'Number_in_Poverty' and 'Total_Population' have 5 missing values each
    remaining_imputer = SimpleImputer(strategy='mean')
    for col in ['Number_in_Poverty', 'Total_Population']:
        if col in merged_data_clean.columns:
            merged_data_clean[col] = remaining_imputer.fit_transform(merged_data_clean[[col]])
            print(f"Imputed missing values in '{col}' with mean.")
        else:
            print(f"Warning: '{col}' column not found in merged_data_clean.")

    # Final check for missing values after imputation
    print("\nMissing values in merged_data_clean after all imputations:")
    print(merged_data_clean.isnull().sum())

Missing values in merged_data:
State                                                                    0
Year                                                                     0
Fraud_Count                                                              0
Number_in_Poverty                                                        5
Poverty_Rate                                                             5
Total_Population                                                         5
2013 Rural-urban Continuum Code                                        324
2013 Urban Influence Code                                              324
Bachelor's degree or higher,-22                                        303
High school diploma only,-22                                           303
Less than a high school diploma,-22                                    303
Percent of adults completing some college or associate's degree,-22    303
Percent of adults with a bachelor's degree or higher,-22               303
Percent of adults with a high school diploma only,-22                  303
Percent of adults with less than a high school diploma,-22             303
Some college or associate's degree,-22                                 303
HPI                                                                      5
Unemployment_Rate                                                        5
dtype: int64
Imputed missing values in 'Poverty_Rate' with mean.
Imputed missing values in 'HPI' with mean.
Imputed missing values in 'Unemployment_Rate' with median.

Columns to drop due to high missingness (>50% missing):
['2013 Rural-urban Continuum Code', '2013 Urban Influence Code', "Bachelor's degree or higher,-22", 'High school diploma only,-22', 'Less than a high school diploma,-22', "Percent of adults completing some college or associate's degree,-22", "Percent of adults with a bachelor's degree or higher,-22", 'Percent of adults with a high school diploma only,-22', 'Percent of adults with less than a high school diploma,-22', "Some college or associate's degree,-22"]

Dropped columns with high missingness.

Number of rows before dropping high missingness columns: 338
Number of rows after dropping high missingness columns: 338

Missing values in merged_data_clean after handling:
State                0
Year                 0
Fraud_Count          0
Number_in_Poverty    5
Poverty_Rate         0
Total_Population     5
HPI                  0
Unemployment_Rate    0
dtype: int64
Imputed missing values in 'Number_in_Poverty' with mean.
Imputed missing values in 'Total_Population' with mean.

Missing values in merged_data_clean after all imputations:
State                0
Year                 0
Fraud_Count          0
Number_in_Poverty    0
Poverty_Rate         0
Total_Population     0
HPI                  0
Unemployment_Rate    0
dtype: int64

features = [col for col in merged_data_clean.columns if col not in ['State', 'Year', 'Fraud_Count']]

# Compute correlation matrix
corr_matrix = merged_data_clean[['Fraud_Count'] + features].corr()

# Plot heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()

# Define the target variable
target = 'Fraud_Count'

# Define features: all columns except 'State', 'Year', and 'Fraud_Count'
features = [col for col in merged_data_clean.columns if col not in ['State', 'Year', 'Fraud_Count']]

# Display selected features
print("\nSelected Features for Analysis:")
print(features)

# Extract feature matrix (X) and target vector (y)
X = merged_data_clean[features]
y = merged_data_clean[target]

# Check the number of samples and features
print(f"\nNumber of samples: {X.shape[0]}")
print(f"Number of features: {X.shape[1]}")

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTraining set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

Selected Features for Analysis:
['Number_in_Poverty', 'Poverty_Rate', 'Total_Population', 'HPI', 'Unemployment_Rate']

Number of samples: 338
Number of features: 5

Training set size: 270 samples
Testing set size: 68 samples

# Initialize the Decision Tree Regressor
dt_model = DecisionTreeRegressor(random_state=42)

# Train the model on the training data
dt_model.fit(X_train, y_train)

print("\nDecision Tree Regressor has been trained.")

Decision Tree Regressor has been trained.

# Make predictions on the test set
y_pred = dt_model.predict(X_test)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

print("\nDecision Tree Regressor Performance:")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared: {r2:.4f}")

Decision Tree Regressor Performance:
Mean Squared Error (MSE): 7135722.66
R-squared: -0.1804

# Get feature importances from the model
importances = dt_model.feature_importances_

# Create a DataFrame for feature importances
feature_importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Display feature importances
print("\nFeature Importances:")
print(feature_importance_df)

# Plot feature importances
plt.figure(figsize=(12, 8))
sns.barplot(data=feature_importance_df, x='Importance', y='Feature', palette='viridis')
plt.title('Feature Importances from Decision Tree Regressor', fontsize=16)
plt.xlabel('Importance', fontsize=14)
plt.ylabel('Feature', fontsize=14)
plt.tight_layout()
plt.show()

Feature Importances:
             Feature  Importance
4  Unemployment_Rate    0.392510
0  Number_in_Poverty    0.258580
2   Total_Population    0.160291
3                HPI    0.102416
1       Poverty_Rate    0.086203

# Initialize Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model on the training data
rf_model.fit(X_train, y_train)
print("\nRandom Forest Regressor has been trained successfully!")

# Make predictions on the test set
y_pred_rf = rf_model.predict(X_test)

# Calculate Mean Squared Error (MSE) and R-squared
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("\nRandom Forest Regressor Performance:")
print(f"Mean Squared Error (MSE): {mse_rf:.2f}")
print(f"R-squared: {r2_rf:.4f}")

# Extract feature importances from the Random Forest model
importances_rf = rf_model.feature_importances_
feature_importance_rf = pd.DataFrame({
    'Feature': features,
    'Importance': importances_rf
}).sort_values(by='Importance', ascending=False)

# Display feature importances
print("\nRandom Forest Feature Importances:")
print(feature_importance_rf)

# Plot feature importances
plt.figure(figsize=(12, 8))
sns.barplot(data=feature_importance_rf, x='Importance', y='Feature', palette='magma')
plt.title('Feature Importances from Random Forest Regressor', fontsize=16)
plt.xlabel('Importance', fontsize=14)
plt.ylabel('Feature', fontsize=14)
plt.tight_layout()
plt.show()

Random Forest Regressor has been trained successfully!

Random Forest Regressor Performance:
Mean Squared Error (MSE): 5022413.27
R-squared: 0.1692

Random Forest Feature Importances:
             Feature  Importance
2   Total_Population    0.350062
0  Number_in_Poverty    0.254283
4  Unemployment_Rate    0.165945
3                HPI    0.119327
1       Poverty_Rate    0.110383

# Define parameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV
grid_search_rf = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid_rf,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)

# Fit GridSearchCV
print("\nStarting Grid Search for Random Forest...")
grid_search_rf.fit(X_train, y_train)

# Best parameters
print("\nBest parameters found for Random Forest:")
print(grid_search_rf.best_params_)

# Best estimator
best_rf_model = grid_search_rf.best_estimator_

# Predict with the best model
y_pred_best_rf = best_rf_model.predict(X_test)

# Evaluate
mse_best_rf = mean_squared_error(y_test, y_pred_best_rf)
r2_best_rf = r2_score(y_test, y_pred_best_rf)

print(f"\nBest Random Forest Regressor Performance:")
print(f"Mean Squared Error (MSE): {mse_best_rf:.2f}")
print(f"R-squared: {r2_best_rf:.4f}")

# Feature Importances from the best Random Forest model
importances_best_rf = best_rf_model.feature_importances_
feature_importance_best_rf = pd.DataFrame({
    'Feature': features,
    'Importance': importances_best_rf
}).sort_values(by='Importance', ascending=False)

print("\nBest Random Forest Feature Importances:")
print(feature_importance_best_rf)

# Plot feature importances
plt.figure(figsize=(12, 8))
sns.barplot(data=feature_importance_best_rf, x='Importance', y='Feature', palette='viridis')
plt.title('Feature Importances from Best Random Forest Regressor', fontsize=16)
plt.xlabel('Importance', fontsize=14)
plt.ylabel('Feature', fontsize=14)
plt.tight_layout()
plt.show()

Starting Grid Search for Random Forest...
Fitting 5 folds for each of 108 candidates, totalling 540 fits

Best parameters found for Random Forest:
{'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}

Best Random Forest Regressor Performance:
Mean Squared Error (MSE): 4985844.50
R-squared: 0.1753

Best Random Forest Feature Importances:
             Feature  Importance
2   Total_Population    0.413187
0  Number_in_Poverty    0.290081
4  Unemployment_Rate    0.142372
3                HPI    0.080071
1       Poverty_Rate    0.074289

# Initialize the model with best parameters
rf_model_cv = RandomForestRegressor(
    n_estimators=best_rf_model.n_estimators,
    max_depth=best_rf_model.max_depth,
    min_samples_split=best_rf_model.min_samples_split,
    min_samples_leaf=best_rf_model.min_samples_leaf,
    random_state=42
)

# Perform 5-fold cross-validation
cv_scores = cross_val_score(rf_model_cv, X, y, cv=5, scoring='r2')

print("\nCross-Validation R-squared Scores:")
print(cv_scores)
print(f"Mean R-squared: {cv_scores.mean():.4f}")
print(f"Standard Deviation: {cv_scores.std():.4f}")

Cross-Validation R-squared Scores:
[-0.29415514 -1.14642968  0.17699864  0.26794702  0.27438956]
Mean R-squared: -0.1442
Standard Deviation: 0.5432

# Create polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)
poly_features = poly.get_feature_names_out(features)
X_poly = pd.DataFrame(X_poly, columns=poly_features)

# Split the polynomial features
X_train_poly, X_test_poly, y_train_poly, y_test_poly = train_test_split(X_poly, y, test_size=0.2, random_state=42)

# Train a new Random Forest model on polynomial features
rf_model_poly = RandomForestRegressor(
    n_estimators=best_rf_model.n_estimators,
    max_depth=best_rf_model.max_depth,
    min_samples_split=best_rf_model.min_samples_split,
    min_samples_leaf=best_rf_model.min_samples_leaf,
    random_state=42
)

rf_model_poly.fit(X_train_poly, y_train_poly)
y_pred_poly = rf_model_poly.predict(X_test_poly)

# Evaluate the new model
mse_poly = mean_squared_error(y_test_poly, y_pred_poly)
r2_poly = r2_score(y_test_poly, y_pred_poly)

print("\nRandom Forest Regressor with Polynomial Features Performance:")
print(f"Mean Squared Error (MSE): {mse_poly:.2f}")
print(f"R-squared: {r2_poly:.4f}")

Random Forest Regressor with Polynomial Features Performance:
Mean Squared Error (MSE): 5076845.21
R-squared: 0.1602

from sklearn.ensemble import GradientBoostingRegressor

# Initialize Gradient Boosting Regressor
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

# Train the model
gb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_gb = gb_model.predict(X_test)

# Evaluate
mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

print("\nGradient Boosting Regressor Performance:")
print(f"Mean Squared Error (MSE): {mse_gb:.2f}")
print(f"R-squared: {r2_gb:.4f}")

# Feature Importances from Gradient Boosting
importances_gb = gb_model.feature_importances_
feature_importance_gb = pd.DataFrame({
    'Feature': features,
    'Importance': importances_gb
}).sort_values(by='Importance', ascending=False)

print("\nGradient Boosting Feature Importances:")
print(feature_importance_gb)

# Plot feature importances
plt.figure(figsize=(12, 8))
sns.barplot(data=feature_importance_gb, x='Importance', y='Feature', palette='coolwarm')
plt.title('Feature Importances from Gradient Boosting Regressor', fontsize=16)
plt.xlabel('Importance', fontsize=14)
plt.ylabel('Feature', fontsize=14)
plt.tight_layout()
plt.show()

Gradient Boosting Regressor Performance:
Mean Squared Error (MSE): 4773147.68
R-squared: 0.2104

Gradient Boosting Feature Importances:
             Feature  Importance
0  Number_in_Poverty    0.290263
2   Total_Population    0.290033
4  Unemployment_Rate    0.238169
3                HPI    0.111142
1       Poverty_Rate    0.070392

print("The 25th quantile of fraud count is " + str(merged_data_clean["Fraud_Count"].quantile(0.25)))
print("The 50th quantile of fraud count is " + str(merged_data_clean["Fraud_Count"].quantile(0.50)))
print("The 75th quantile of fraud count is " + str(merged_data_clean["Fraud_Count"].quantile(0.75)))

The 25th quantile of fraud count is 13.0
The 50th quantile of fraud count is 126.0
The 75th quantile of fraud count is 562.5

merged_data_clean.head(20)

merged_data_clean['Fraud Rate'] = np.where(merged_data_clean['Fraud_Count'] < 13, "Low", (np.where(merged_data_clean['Fraud_Count'] > 562.5, "High", "Average")))

merged_data_clean.head(20)

merged_data_clean['Fraud Rate'].value_counts()

Fraud Rate
Average    169
High        85
Low         84
Name: count, dtype: int64

from imblearn.over_sampling import SMOTE

# Load your dataset
merged_data_classification = merged_data_clean.drop(["State", "Fraud_Count"], axis=1)

# Separate features and target
X = merged_data_classification.drop('Fraud Rate', axis=1)
y = merged_data_classification['Fraud Rate']

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Combine resampled features and target into a new DataFrame
balanced_data = pd.DataFrame(X_resampled, columns=X.columns)
balanced_data['Fraud Rate'] = y_resampled

# Making sure that the fraud rate is now balanced across all categories
balanced_data['Fraud Rate'].value_counts()

Fraud Rate
Average    169
High       169
Low        169
Name: count, dtype: int64

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier

# Assume 'target' is the column with fraud rate categories (low, average, high)
X = balanced_data.drop(['Fraud Rate'], axis=1)  
y = balanced_data['Fraud Rate'] 

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# List of classifiers to evaluate
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Classifier': SVC(),
    'Decision Tree': DecisionTreeClassifier(),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

# Evaluate each classifier
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)  # Train the model
    y_pred = clf.predict(X_test)  # Make predictions
    accuracy = accuracy_score(y_test, y_pred)  # Calculate accuracy
    
    # Get the classification report
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Extract precision and recall for each class
    precision = report['weighted avg']['precision']
    recall = report['weighted avg']['recall']
    
    # Print the results
    print(f"{name} - Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}")


# Visualize a decision tree from the Random Forest
rf_classifier = classifiers['Random Forest']  # Get the trained Random Forest model
tree_to_plot = rf_classifier.estimators_[0]  # Extract the first tree

# Get feature importances
importances = rf_classifier.feature_importances_

# Create a DataFrame for better visualization
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
})

# Sort the DataFrame by importance and reset index
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)

# Format the importance scores as percentages
feature_importance_df['Importance'] = feature_importance_df['Importance'].apply(lambda x: f"{x:.2%}")

# Print formatted output
print("\n#### Feature Importance Analysis")
print("\nTop features ranked by importance in the Random Forest model:\n")
print(feature_importance_df.to_string(index=False))

Logistic Regression - Accuracy: 0.53, Precision: 0.52, Recall: 0.53
Random Forest - Accuracy: 0.76, Precision: 0.79, Recall: 0.76
Support Vector Classifier - Accuracy: 0.51, Precision: 0.52, Recall: 0.51
Decision Tree - Accuracy: 0.67, Precision: 0.68, Recall: 0.67
K-Nearest Neighbors - Accuracy: 0.71, Precision: 0.72, Recall: 0.71

#### Feature Importance Analysis

Top features ranked by importance in the Random Forest model:

          Feature Importance
 Total_Population     28.76%
Number_in_Poverty     21.03%
     Poverty_Rate     14.28%
Unemployment_Rate     14.10%
              HPI     12.91%
             Year      8.91%

	data_year	ori	pub_agency_name	pub_agency_unit	agency_type_name	state_abbr	state_name	division_name	county_name	region_name	...	location_name	weapon_code	weapon_name	prop_desc_code	prop_desc_code.1	prop_desc_name	stolen_value	recovered_value	recovered_flag	date_recovered
0	2012	FL0160200	Jacksonville	NaN	City	FL	Florida	South Atlantic	DUVAL	South	...	Not Specified	12	Handgun	8.0	8.0	Consumable Goods	10000.0	10000	True	2012-07-25
1	2012	FL0160200	Jacksonville	NaN	City	FL	Florida	South Atlantic	DUVAL	South	...	Not Specified	12	Handgun	8.0	8.0	Consumable Goods	10000.0	10000	True	2012-07-25
2	2012	FL0160200	Jacksonville	NaN	City	FL	Florida	South Atlantic	DUVAL	South	...	Not Specified	12	Handgun	8.0	8.0	Consumable Goods	10000.0	10000	True	2012-07-25
3	2012	FL0160200	Jacksonville	NaN	City	FL	Florida	South Atlantic	DUVAL	South	...	Not Specified	12	Handgun	8.0	8.0	Consumable Goods	10000.0	10000	True	2012-07-25
4	2012	FL0160200	Jacksonville	NaN	City	FL	Florida	South Atlantic	DUVAL	South	...	Commercial/Office Building	12	Handgun	8.0	8.0	Consumable Goods	10000.0	10000	True	2012-07-25

	data_year	ori	pub_agency_name	pub_agency_unit	agency_type_name	state_abbr	state_name	division_name	county_name	region_name	...	location_name	weapon_code	weapon_name	prop_desc_code	prop_desc_code.1	prop_desc_name	stolen_value	recovered_flag	date_recovered
349	2012	SC0200000	Fairfield	NaN	County	SC	South Carolina	South Atlantic	FAIRFIELD	South	...	Convenience Store	NaN	NaN	9.0	9.0	Credit/ Debit cards	0.0	False	NaN
352	2012	SC0200000	Fairfield	NaN	County	SC	South Carolina	South Atlantic	FAIRFIELD	South	...	Residence/Home	NaN	NaN	9.0	9.0	Credit/ Debit cards	0.0	False	NaN
354	2012	SC0220200	Georgetown	NaN	City	SC	South Carolina	South Atlantic	GEORGETOWN	South	...	Shopping Mall	NaN	NaN	71.0	71.0	Metals, Non-Precious	500.0	False	NaN
433	2012	TN0320100	Morristown	NaN	City	TN	Tennessee	East South Central	HAMBLEN, JEFFERSON	South	...	Department/Discount Store	NaN	NaN	19.0	19.0	Merchandise	1066.0	False	NaN
512	2014	MI8121800	Ann Arbor	NaN	City	MI	Michigan	East North Central	WASHTENAW	Midwest	...	Service/Gas Station	NaN	NaN	9.0	9.0	Credit/ Debit cards	0.0	False	NaN

	State	Year	Number_in_Poverty	Poverty_Rate	Total_Population
0	alabama	2012	777.0	16.2	4808.0
1	alabama	2013	891.0	18.5	4807.0
2	alabama	2014	848.0	17.8	4765.0
3	alabama	2015	784.0	16.3	4820.0
4	alabama	2016	782.0	16.2	4821.0
...	...	...	...	...	...
556	wyoming	2018	53.0	9.4	565.0
557	wyoming	2019	52.0	9.2	565.0
558	wyoming	2020	55.0	9.8	561.0
559	wyoming	2021	53.0	9.3	568.0
560	wyoming	2022	40.0	7.1	562.0

	FIPS Code	State	Area name	2003 Urban Influence Code	2013 Urban Influence Code	2013 Rural-urban Continuum Code	2023 Rural-urban Continuum Code	Less than a high school diploma, 1970	High school diploma only, 1970	Some college (1-3 years), 1970	...	Percent of adults completing some college or associate's degree, 2008-12	Percent of adults with a bachelor's degree or higher, 2008-12	Less than a high school diploma, 2018-22	High school diploma only, 2018-22	Some college or associate's degree, 2018-22	Bachelor's degree or higher, 2018-22	Percent of adults with less than a high school diploma, 2018-22	Percent of adults with a high school diploma only, 2018-22	Percent of adults completing some college or associate's degree, 2018-22	Percent of adults with a bachelor's degree or higher, 2018-22
0	0	US	United States	NaN	NaN	NaN	NaN	52,373,312	34,158,051	11,650,730	...	29.0	28.5	24,599,698	59,741,825	64,508,122	77,751,347	10.9	26.4	28.5	34.3
1	1000	AL	Alabama	NaN	NaN	NaN	NaN	1,062,306	468,269	136,287	...	29.0	22.3	421,180	1,041,725	1,032,770	932,845	12.3	30.4	30.1	27.2
2	1001	AL	Autauga County	2.0	2.0	2.0	2.0	6,611	3,757	933	...	29.6	21.7	3,857	12,517	11,935	11,879	9.6	31.1	29.7	29.6
3	1003	AL	Baldwin County	5.0	2.0	3.0	3.0	18,726	8,426	2,334	...	31.8	27.7	14,031	46,391	52,215	54,385	8.4	27.8	31.3	32.6
4	1005	AL	Barbour County	6.0	6.0	6.0	6.0	8,120	2,242	581	...	25.8	14.5	4,155	6,507	4,913	2,100	23.5	36.8	27.8	11.9
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
3289	72145	PR	Vega Baja Municipio	1.0	1.0	1.0	1.0	NaN	NaN	NaN	...	24.0	17.6	9,267	11,653	8,932	10,001	23.3	29.2	22.4	25.1
3290	72147	PR	Vieques Municipio	12.0	12.0	7.0	6.0	NaN	NaN	NaN	...	15.9	10.1	1,668	2,793	681	875	27.7	46.4	11.3	14.5
3291	72149	PR	Villalba Municipio	2.0	2.0	2.0	2.0	NaN	NaN	NaN	...	14.3	15.2	3,274	5,344	3,555	3,358	21.1	34.4	22.9	21.6
3292	72151	PR	Yabucoa Municipio	1.0	1.0	1.0	1.0	NaN	NaN	NaN	...	20.0	12.9	5,674	5,824	6,852	4,147	25.2	25.9	30.5	18.4
3293	72153	PR	Yauco Municipio	2.0	2.0	2.0	2.0	NaN	NaN	NaN	...	17.8	16.8	5,330	8,725	4,354	7,030	21.0	34.3	17.1	27.6

	State	Year	Fraud_Count	Number_in_Poverty	Poverty_Rate	Total_Population
0	alabama	2017	16	735.0	15.3	4801.0
1	alabama	2018	15	779.0	16.0	4877.0
2	alabama	2019	42	627.0	12.9	4873.0
3	alabama	2020	140	744.0	14.9	5001.0
4	alabama	2021	1806	787.0	15.9	4961.0
...	...	...	...	...	...	...
333	wisconsin	2021	218	551.0	9.3	5910.0
334	wisconsin	2022	135	386.0	6.6	5807.0
335	wyoming	2020	1	55.0	9.8	561.0
336	wyoming	2021	2	53.0	9.3	568.0
337	wyoming	2022	6	40.0	7.1	562.0

	State	Year	HPI
0	alabama	2012	341.58
1	alabama	2013	339.66
2	alabama	2014	344.12
3	alabama	2015	352.11
4	alabama	2016	361.39
...	...	...	...
556	wyoming	2018	582.12
557	wyoming	2019	607.60
558	wyoming	2020	630.87
559	wyoming	2021	690.41
560	wyoming	2022	791.12

	State	Year	homeless_rate_change
0	alabama	2022	-11.9
1	arkansas	2022	6.1
2	arizona	2022	5.0
3	california	2022	5.8
4	colorado	2022	38.9
...	...	...	...
779	vermont	2007	218.4
780	washington	2007	19.9
781	wisconsin	2007	-13.9
782	west virginia	2007	-41.2
783	wyoming	2007	-0.9

	State	Year	Fraud_Count	Number_in_Poverty	Poverty_Rate	Total_Population	HPI	Unemployment_Rate
0	alabama	2017	16	735.0	15.3	4801.0	372.82	5.161765
1	alabama	2018	15	779.0	16.0	4877.0	389.30	4.455882
2	alabama	2019	42	627.0	12.9	4873.0	407.03	3.647059
3	alabama	2020	140	744.0	14.9	5001.0	423.50	6.847059
4	alabama	2021	1806	787.0	15.9	4961.0	468.41	3.798529
5	alabama	2022	1465	681.0	13.6	4990.0	543.72	2.969118
6	arizona	2017	3	951.0	13.6	6981.0	589.97	6.743750
7	arizona	2018	1	929.0	12.8	7241.0	635.76	6.525000
8	arizona	2019	127	725.0	10.0	7285.0	679.07	6.456250
9	arizona	2020	179	783.0	10.9	7203.0	723.45	8.825000
10	arizona	2021	76	922.0	12.6	7303.0	860.73	6.100000
11	arizona	2022	367	920.0	12.7	7239.0	1045.92	4.987500
12	arkansas	2013	30	395.0	13.8	2852.0	355.45	7.968421
13	arkansas	2014	209	532.0	18.4	2891.0	360.77	6.681579
14	arkansas	2015	303	475.0	16.1	2951.0	368.99	5.693421
15	arkansas	2016	149	471.0	16.0	2941.0	378.03	4.614474
16	arkansas	2017	266	436.0	14.9	2921.0	391.30	4.275000
17	arkansas	2018	174	462.0	15.9	2912.0	407.81	4.165789
18	arkansas	2019	58	412.0	14.1	2915.0	421.50	4.117105
19	arkansas	2020	106	420.0	14.1	2974.0	432.93	6.457895

Fraud Detection Project¶

Collaboration Plan¶

Overview¶

Technologies Used¶

Current Focus¶

Roadmap & Milestones¶

Milestone 1 – Initial Dataset Selection & ETL¶

Milestone 2 – Additional ETL & Exploratory Data Analysis¶

Deliverable 1 – In-Class Presentation¶

Milestone 3 – Model Development & Evaluation¶

Deliverable 2 – Final Website & Presentation¶

Project Goals¶

1. Fraud Pattern Analysis¶

2. Feature Importance Assessment¶

3. Machine Learning Model Development & Evaluation¶

4. Comprehensive Data Analysis¶

5. Actionable Insights & Final Presentation¶

Initial Exploratory Data Analysis (EDA)¶

Geographic Analysis Overview¶

Regional Analysis Objectives¶

State-Level Analysis¶

Detailed Geographic Investigation¶

Key Observation¶

Demographic and Victim Analysis¶

Multifaceted Investigation¶

Race-Based Analysis¶

Victim Type Classification¶

Strategic Insights¶

Analysis Value¶

Observed Pattern¶

Primary Finding¶

Year-Over-Year Fraud Trends¶

COVID-19 Impact Analysis¶

Macroeconomic Integration¶

House Price Index (HPI) Analysis¶

Data Source Overview¶

HPI Dataset Characteristics¶

Analysis Purpose¶

Strategic Value¶

Integration Benefits¶

HPI Trend Visualization Analysis¶

Visualization Overview¶

Key Observations¶

State-Level HPI Analysis (2012-2022)¶

Key Findings¶

Poverty Dataset Analysis¶

Data Source Overview¶

State Poverty Rate Analysis (2007-2022)¶

Key Findings¶

Homelessness Data Analysis¶

Data Source Overview¶

State Homelessness Trends Analysis (2007-2022)¶

Key Findings¶

Feature Selection Analysis for Fraud Prediction¶

Dataset Integration¶

Result of Initial Model Outputs¶

Strategic Pivot in Modeling Approach¶

New Classification Framework¶

Rationale for the Change¶

Implementation Benefits¶

Shifting to a Categorical Approach¶

New Classification Framework¶

Rationale for the Change¶

Selected Classification Models¶

Model Selection: Random Forest Classifier¶

Advantages of Random Forest for Our Use Case¶

Key Factors in Selection¶

Project Conclusion¶

Skills Application and Learning Outcomes¶

Project Limitations and Challenges¶

Final Achievements¶

Future Recommendations¶