import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import missingno
import seaborn
plt.style.use('ggplot')
pd.options.display.max_columns = None # to force pandas to display all columns
pd.options.display.max_rows = None # to force pandas to display all rows.
# Warning: with max_rows = None, always remember to use .head() when looking at the complete dataset to prevent displaying all rows
pd.options.display.max_colwidth = 100 # to limit max column width. (To remove the limit, set to None.)
Note: This notebook uses the Sentencing dataset from the Cook County Government.
direct_download_link = "https://datacatalog.cookcountyil.gov/api/views/tg8v-tm6u/rows.csv?accessType=DOWNLOAD"
csv_filepath = "Sentencing.csv"
try:
    sentencing_orig = pd.read_csv(csv_filepath,
                                  parse_dates=["DISPOSITION_DATE", "SENTENCE_DATE",
                                               "INCIDENT_BEGIN_DATE", "INCIDENT_END_DATE",
                                               "ARREST_DATE", "ARRAIGNMENT_DATE", "RECEIVED_DATE"],
                                  index_col=0)
    print("Loaded dataset from CSV.")
except FileNotFoundError:
    print("Could not find data. Downloading a ~100MB file. This will take a few minutes.")
    sentencing_orig = pd.read_csv(direct_download_link,
                                  parse_dates=["DISPOSITION_DATE", "SENTENCE_DATE",
                                               "INCIDENT_BEGIN_DATE", "INCIDENT_END_DATE",
                                               "ARREST_DATE", "ARRAIGNMENT_DATE", "RECEIVED_DATE"])
    sentencing_orig.to_csv(csv_filepath, index=True)
    print(f"Downloaded CSV and saved to disk as {csv_filepath}")
sentencing_orig.shape
sentencing_orig.columns
# Preview
sentencing_orig.head()
# categorize features for preprocessing
# features that don't affect the model
drop_features = ["CASE_ID", "CASE_PARTICIPANT_ID", "CHARGE_ID", "CHARGE_VERSION_ID"]
numeric_features = ["LENGTH_OF_CASE_in_Days","AGE_AT_INCIDENT"]
# features to be one-hot encoded
categorical_features = ["OFFENSE_CATEGORY", "DISPOSITION_CHARGED_OFFENSE_TITLE", "CHARGE_DISPOSITION",
"SENTENCE_PHASE", "SENTENCE_JUDGE", "SENTENCE_TYPE", "COMMITMENT_TYPE",
"COMMITMENT_TYPE", "GENDER", "RACE", "LAW_ENFORCEMENT_AGENCY",
"UPDATED_OFFENSE_CATEGORY", "CHARGE_COUNT"] #use one-hot encoding with drop first
# these will also be one-hot encoded
boolean_features = ["CURRENT_SENTENCE", "PRIMARY_CHARGE"]
# ordinal encoding
ordinal_features = []
# to be dropped for now
features_for_later = ["COURT_NAME", "COURT_FACILITY", "INCIDENT_CITY"]
complex_or_unknown_features = ["DISPOSITION_CHARGED_CHAPTER", "DISPOSITION_CHARGED_ACT", "DISPOSITION_CHARGED_SECTION",
"DISPOSITION_CHARGED_CLASS", "DISPOSITION_CHARGED_AOIC", "UNIT"]
# to be dropped because not enough examples have feature
too_many_missing_features = ["CHARGE_DISPOSITION_REASON"]
# time features (also will be dropped for now)
time_features = [b"DISPOSITION_DATE", "SENTENCE_DATE", "INCIDENT_BEGIN_DATE", "INCIDENT_END_DATE", "ARREST_DATE",
"RECEIVED_DATE", "ARRAIGNMENT_DATE"]
# what we are predicting (y)
target = ["COMMITMENT_TERM", "COMMITMENT_UNIT"]
sentencing_orig.dtypes
A life sentence can appear in one of two columns (with the other column being NA). Below, I determine which columns (COMMITMENT_TYPE/COMMITMENT_UNIT) contain this information and pull it out into a new column called 'categorical_sentence'.
# Trying to figure out which columns include 'Life Sentence'
life_anywhere = sentencing_orig.eq('Natural Life').any(axis=1)
life_SENTENCE_PHASE = sentencing_orig["SENTENCE_PHASE"].str.contains("life", case=False, na=False) # Total 0
life_COMMITMENT_UNIT = sentencing_orig["COMMITMENT_UNIT"].str.contains("life", case=False, na=False) # Total 712
life_COMMITMENT_TERM = sentencing_orig["COMMITMENT_TERM"].str.contains("life", case=False, na=False) # Total 0
life_COMMITMENT_TYPE = sentencing_orig["COMMITMENT_TYPE"].str.contains("life", case=False, na=False) # Total 20
# CURRENT_SENTENCE is skipped: it is a boolean column (so .str fails) and is unlikely to contain a life sentence.
life_SENTENCE_TYPE = sentencing_orig["SENTENCE_TYPE"].str.contains("life", case=False, na=False) # Total 0
life_COMMITMENT_UNIT_and_COMMITMENT_TYPE = life_COMMITMENT_UNIT & life_COMMITMENT_TYPE # Total 1
life_COMMITMENT_UNIT_or_COMMITMENT_TYPE = life_COMMITMENT_UNIT | life_COMMITMENT_TYPE # Total 731
print(f"Number of rows with the words 'Natural Life' anywhere: {life_anywhere.sum()}")
print(f"Number of rows with the word life in SENTENCE_PHASE: {life_SENTENCE_PHASE.sum()}")
print(f"Number of rows with the word life in COMMITMENT_UNIT: {life_COMMITMENT_UNIT.sum()}")
print(f"Number of rows with the word life in COMMITMENT_TERM: {life_COMMITMENT_TERM.sum()}")
print(f"Number of rows with the word life in COMMITMENT_TYPE: {life_COMMITMENT_TYPE.sum()}")
print(f"Number of rows with the word life in SENTENCE_TYPE: {life_SENTENCE_TYPE.sum()}")
print(f"Number of rows with the word life in COMMITMENT_UNIT and COMMITMENT_TYPE: {life_COMMITMENT_UNIT_and_COMMITMENT_TYPE.sum()}")
print(f"Number of rows with the word life in COMMITMENT_UNIT or COMMITMENT_TYPE: {life_COMMITMENT_UNIT_or_COMMITMENT_TYPE.sum()}")
# Conclusion: Life sentences appear either in COMMITMENT_UNIT or in COMMITMENT_TYPE columns
def processLifeSentences(df):
    """
    Input: Cook County dataset.
    Output: Cook County dataset with an additional column called 'categorical_sentence',
            which is "Life" for a life sentence and "Other" for everything else.
    """
    life_COMMITMENT_UNIT = df["COMMITMENT_UNIT"].str.contains("life", case=False, na=False)
    life_COMMITMENT_TYPE = df["COMMITMENT_TYPE"].str.contains("life", case=False, na=False)
    life_or_not = life_COMMITMENT_UNIT | life_COMMITMENT_TYPE
    life_or_not = life_or_not.replace({True: "Life", False: "Other"})
    df.loc[:, "categorical_sentence"] = life_or_not
    return df
select_columns = ["DISPOSITION_CHARGED_OFFENSE_TITLE", "UPDATED_OFFENSE_CATEGORY", "COMMITMENT_TYPE", "COMMITMENT_TERM", "COMMITMENT_UNIT", "categorical_sentence"]
processed_life_sentences = processLifeSentences(sentencing_orig)
# Verify that 731 rows have Life sentence in the 'categorical_sentence' column
(processed_life_sentences["categorical_sentence"]=="Life").sum()
# Verify that no rows have been dropped
len(sentencing_orig) - len(processed_life_sentences)
# Preview a sample of the dataframe. Only selected columns displayed. Note the new column 'categorical_sentence'
processed_life_sentences.head(15)[select_columns]
print(processed_life_sentences["SENTENCE_TYPE"].str.contains("Death|death|DEATH", na=False).sum())
print(processed_life_sentences["COMMITMENT_TYPE"].str.contains("Death|death|DEATH", na=False).sum())
def dropDeathSentence(df):
    """
    Input: Cook County dataset after it has been processed by processLifeSentences and the 'categorical_sentence' column has been added.
    Output: Input dataset minus any rows involving a death sentence, because most of these are clearly errors (see notes).
    """
    # death_in_SENTENCE_TYPE = df.query("SENTENCE_TYPE == 'Death'") # 63 examples
    death_in_SENTENCE_TYPE = df["SENTENCE_TYPE"].str.contains("death", case=False, na=False)
    # death_in_COMMITMENT_TYPE = df.query("COMMITMENT_TYPE == 'Death'") # 9 examples
    death_in_COMMITMENT_TYPE = df["COMMITMENT_TYPE"].str.contains("death", case=False, na=False)
    incorrect_death_sentences = death_in_SENTENCE_TYPE | death_in_COMMITMENT_TYPE
    # Life sentences should not be dropped even if they also appear as a death sentence. See notes.
    life_sentences_mask = df["categorical_sentence"] == "Life"
    return df[~incorrect_death_sentences | life_sentences_mask]
no_death_sentences = dropDeathSentence(processed_life_sentences)
# Number of rows removed
len(sentencing_orig) - len(no_death_sentences)
# Verify no rows with death sentences remain except where it is also a Life sentence
print(((no_death_sentences["SENTENCE_TYPE"].str.contains("Death|death|DEATH", na=False)) & (no_death_sentences["categorical_sentence"]!="Life")).sum())
print(((no_death_sentences["COMMITMENT_TYPE"].str.contains("Death|death|DEATH", na=False)) & (no_death_sentences["categorical_sentence"]!="Life")).sum())
Examples with any other COMMITMENT_TYPE should be dropped (the reasons are in the notes at the end). Keep only the COMMITMENT_TYPE values listed in the function below, renaming ones that are grouped to the group name:
def processCOMMITMENT_TYPE(df):
    """
    Input: Cook County dataset which includes the "categorical_sentence" column added by processLifeSentences.
    Output: Keeps only rows with select COMMITMENT_TYPE values (see notes for reasons) and drops the rest.
            Fills the categorical_sentence column with the same value as COMMITMENT_TYPE
            (except for a few categories which are grouped together; see notes).
    """
    commitment_types_to_keep = ["Illinois Department of Corrections",
                                "Cook County Department of Corrections",
                                "Cook County Boot Camp",
                                "Cook County Impact Incarceration Program",
                                "Probation",
                                "710/410 Probation",
                                "Intensive Probation Services",
                                "Conditional Discharge",
                                "Court Supervision",
                                "Intensive Drug Probation Services",
                                "Drug Court Probation",
                                "Gang Probation",
                                "Sex Offender Probation",
                                "Drug School",
                                "Juvenile IDOC"]
    commitment_types_mask = df["COMMITMENT_TYPE"].isin(commitment_types_to_keep)
    life_sentences_mask = df["categorical_sentence"] == "Life"  # Boolean mask
    # Keep only rows with a selected COMMITMENT_TYPE or a life sentence
    df = df.loc[commitment_types_mask | life_sentences_mask]
    life_sentences_mask = df["categorical_sentence"] == "Life"  # Recompute the mask after filtering
    # COMMITMENT_TYPE is inconsistent for life sentences; it sometimes shows 'Prison' for a life sentence.
    # For my purposes, a life sentence is different from a regular prison sentence, so wherever the row is a
    # life sentence, overwrite the copied value with 'Life' before writing it into 'categorical_sentence'.
    updated_categorical_sentence = df["COMMITMENT_TYPE"].copy()  # force a copy, which is modified and then written over categorical_sentence
    updated_categorical_sentence.loc[life_sentences_mask] = "Life"
    df.loc[:, "categorical_sentence"] = updated_categorical_sentence  # update categorical_sentence with the new categories
    # Some categories have multiple names. Rename each to a common one.
    df = df.replace({'categorical_sentence':
                     {"Illinois Department of Corrections": "Prison",
                      "Cook County Department of Corrections": "Prison",
                      "Cook County Boot Camp": "Boot Camp",
                      "Cook County Impact Incarceration Program": "Boot Camp",
                      "710/410 Probation": "Probation"}})
    return df
processed_commitment_type = processCOMMITMENT_TYPE(no_death_sentences)
# Preview after adding other categories to 'categorical_sentence'
processed_commitment_type.head(15)[select_columns]
# Note that any rows which have a COMMITMENT_TYPE not in the selected list are dropped
len(no_death_sentences) - len(processed_commitment_type)
Note: I inspected and verified that the only categories in 'categorical_sentence' which lack COMMITMENT_TERM or COMMITMENT_UNIT are "Life" and "Boot Camp", which should not have a time duration. There are some instances of Life and Boot Camp having a time duration, but these are ignored because a duration does not make sense for them.
Cleaning rules:
For 'Life' and 'Boot Camp', the COMMITMENT_TERM and COMMITMENT_UNIT columns are ignored if they are not NaN. The sentence_period_years column will always be NaN for these examples.
The rules below apply to all categories except 'Life' and 'Boot Camp'.
# Split dataset into two parts: one with only 'Life'/'Boot Camp' and the other with the remaining categorical_sentence
no_duration_set = processed_commitment_type.query("categorical_sentence == 'Life' | categorical_sentence =='Boot Camp'")
duration_set = processed_commitment_type.query("categorical_sentence != 'Life' & categorical_sentence !='Boot Camp'")
# Verify that the two sets together are not missing any rows
len(duration_set)+len(no_duration_set)-len(processed_commitment_type)
# Confirm that there are no examples with missing COMMITMENT_TERM or COMMITMENT_UNIT
categories = duration_set["categorical_sentence"].unique()
for category in categories:
    rows_for_category = duration_set.query("categorical_sentence == @category")
    rows_missing_duration = rows_for_category[rows_for_category['COMMITMENT_TERM'].isna() | rows_for_category['COMMITMENT_UNIT'].isna()]
    print(f"{category}: {len(rows_missing_duration)}")
# filter only COMMITMENT_UNIT with ['Year(s)', 'Months', 'Weeks', 'Days']
proper_units_mask = duration_set['COMMITMENT_UNIT'].isin(['Year(s)', 'Months', 'Weeks', 'Days'])
duration_set_filtered_unit = duration_set[proper_units_mask]
print(len(duration_set) - len(duration_set_filtered_unit))
# filter only rows with numeric COMMITMENT_TERM
num_temp = pd.to_numeric(duration_set_filtered_unit["COMMITMENT_TERM"], errors="coerce")
duration_set_filtered_unit_term = duration_set_filtered_unit.copy()
duration_set_filtered_unit_term.loc[:, "COMMITMENT_TERM"] = num_temp
duration_set_filtered_unit_term = duration_set_filtered_unit_term.dropna(subset=["COMMITMENT_TERM"])
print(len(duration_set_filtered_unit) - len(duration_set_filtered_unit_term))
# Verify COMMITMENT_UNIT includes only those that were filtered
duration_set_filtered_unit_term["COMMITMENT_UNIT"].unique()
(no_duration_set['categorical_sentence']=="Life").sum()
# Convert all units to years
all_years = duration_set_filtered_unit_term.copy()
def convert_to_years(df, unit, multiplier):
    """
    Input: Cook County dataset which includes the "categorical_sentence" column added by processLifeSentences.
    Output: Adds a column called 'sentence_period_years' to the dataframe.
            For rows whose COMMITMENT_UNIT matches 'unit', fills sentence_period_years with COMMITMENT_TERM * multiplier.
            Leaves all other rows untouched.
    """
    if "sentence_period_years" not in df.columns:
        df["sentence_period_years"] = np.nan  # add the column, to be filled with the sentence period in years
    multiplied_units = df["COMMITMENT_TERM"] * multiplier  # computed for all rows, including ones not matching 'unit'
    unit_mask = df["COMMITMENT_UNIT"] == unit
    df.loc[unit_mask, "sentence_period_years"] = multiplied_units.loc[unit_mask]
    return df
if "sentence_period_years" not in select_columns:
select_columns.append("sentence_period_years")
all_years["COMMITMENT_UNIT"].unique()
all_years = convert_to_years(all_years, "Year(s)", 1)
all_years = convert_to_years(all_years, "Months", 1/12)
all_years = convert_to_years(all_years, "Weeks", 1/52)
all_years = convert_to_years(all_years, "Days", 1/365.25)
# Verify that no rows are dropped
len(duration_set_filtered_unit_term) - len(all_years)
# Preview after all processing is done
all_years.tail(10)[select_columns]
# Merge no_duration_set (which contains rows with Life/Boot Camp) with all_years (which is the processed duration_set)
# First add 'sentence_period_years' column to no_duration_set and set all rows to NaN
no_duration_set.loc[:, 'sentence_period_years'] = np.nan
# merge no_duration_set and all_years (DataFrame.append is deprecated; use pd.concat)
sentencing_processed = pd.concat([no_duration_set, all_years])
# Verify that all rows were added
(len(sentencing_processed)-len(all_years)) - len(no_duration_set)
# Processing statistics
len_o = len(sentencing_orig)
len_p = len(sentencing_processed)
len_dropped = len_o - len_p
print(f"{len_dropped} rows from the original dataset were dropped")
print(f"This amounts to {100*len_dropped/len_o:.2f}% of the original dataset")
# Random check for errors in data processing (Take random rows from the processed dataset and compare the original columns to the original dataset)
# Reload sentencing_orig to be sure we are comparing to the original data
sentencing_orig = pd.read_csv(csv_filepath,
parse_dates=["DISPOSITION_DATE", "SENTENCE_DATE",
"INCIDENT_BEGIN_DATE", "INCIDENT_END_DATE",
"ARREST_DATE", "ARRAIGNMENT_DATE", "RECEIVED_DATE"],
index_col=0)
for i in range(1000):
    random_position = np.random.randint(0, len(sentencing_processed))  # random positional row in the processed data
    index_number_of_row = sentencing_processed.index[random_position]
    original_row = sentencing_orig.loc[index_number_of_row]  # note we are indexing by ID now
    processed_row = sentencing_processed.loc[index_number_of_row]
    # Stack the two rows; dropna(axis=1) drops 'categorical_sentence' (present only in processed_row)
    # and any column where either row is NaN.
    compare_df = pd.concat([pd.DataFrame(original_row).T, pd.DataFrame(processed_row).T], sort=True).dropna(axis=1)
    if "COMMITMENT_TERM" in compare_df.columns:  # the column is dropped above when either row has it as NaN
        compare_df.loc[:, "COMMITMENT_TERM"] = pd.to_numeric(compare_df.loc[:, "COMMITMENT_TERM"])
    try:
        assert np.all(compare_df.iloc[0] == compare_df.iloc[1]), "error - compare not equal"
    except AssertionError:
        print(compare_df.iloc[0] == compare_df.iloc[1])
        print(f"i = {i}")
        raise  # stop the loop so the differing values can be inspected
print("Success! All tested rows are equal to the original.")
sentencing_processed["RACE"].value_counts()
| RACE | Count |
|---|---|
| Black | 153570 |
| White [Hispanic or Latino] | 35148 |
| White | 32456 |
| HISPANIC | 5744 |
| Asian | 1291 |
| White/Black [Hispanic or Latino] | 1123 |
| Unknown | 324 |
| American Indian | 121 |
| ASIAN | 65 |
| Biracial | 33 |
# Replace race names
sentencing_processed = sentencing_processed.replace({'RACE':
                                                     {"White [Hispanic or Latino]": "Hispanic",
                                                      "HISPANIC": "Hispanic",
                                                      "White/Black [Hispanic or Latino]": "Hispanic",
                                                      "ASIAN": "Asian"}})
# Fill race = NaN with 'Unknown'
missing_race = sentencing_processed["RACE"].isna()
sentencing_processed.loc[missing_race,"RACE"] = "Unknown"
# Limit max sentence_period_years to 100 years. About 30 rows affected
limit_years_mask = sentencing_processed["sentence_period_years"] > 100
sentencing_processed.loc[limit_years_mask, "sentence_period_years"] = 100
# sentencing_processed.sort_values(by=["sentence_period_years"], ascending=False)[["sentence_period_years", "UPDATED_OFFENSE_CATEGORY", "categorical_sentence"]].head(50)
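The same cap can be written with clip (an equivalent one-liner; idempotent, so it is a no-op after the cap above):
sentencing_processed["sentence_period_years"] = sentencing_processed["sentence_period_years"].clip(upper=100)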
# Replace age>100 with median age
median_age = np.median(sentencing_processed["AGE_AT_INCIDENT"].dropna())
limit_age_mask = sentencing_processed["AGE_AT_INCIDENT"] > 100
sentencing_processed.loc[limit_age_mask, "AGE_AT_INCIDENT"] = median_age
np.max(sentencing_processed["AGE_AT_INCIDENT"]) # should be <= 100 (values above 100 were replaced by the median)
# Save processed data to disk
sentencing_processed.to_csv("Sentencing_processed_data.csv") # when reading, remember to set index_col=0
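For a later session, a matching reload snippet (a sketch; the date columns must be re-parsed on read):
# sentencing_processed = pd.read_csv("Sentencing_processed_data.csv",
#                                    parse_dates=["DISPOSITION_DATE", "SENTENCE_DATE",
#                                                 "INCIDENT_BEGIN_DATE", "INCIDENT_END_DATE",
#                                                 "ARREST_DATE", "ARRAIGNMENT_DATE", "RECEIVED_DATE"],
#                                    index_col=0)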
Haven't cleaned this up, but there is some useful information here which should not be deleted.
A sentence can be one of the COMMITMENT_TYPE categories kept in processCOMMITMENT_TYPE (Prison, Boot Camp, Probation and its variants, Conditional Discharge, Court Supervision, Drug School, Juvenile IDOC) or a life sentence.
Reason why 'Death' is ignored: Death can appear under SENTENCE_TYPE (63 examples) or under COMMITMENT_TYPE (9 examples). Since almost all of these examples (both types) include a time period, and since the death penalty looks unlikely in many of these cases, I ignore the death penalty and just take the time period in (COMMITMENT_TERM, COMMITMENT_UNIT).
How to decide whether a sentence is 'Natural Life': it appears in either COMMITMENT_UNIT or COMMITMENT_TYPE (see the analysis above).
Assumption: For crimes that involve multiple charges, each conviction is sentenced independently of the others: no sentence is influenced by another, and the maximum sentence for a single charge is not applied to each of the individual charges. That sentences do differ across charges within a case is easy to confirm by inspection (pick a CASE_ID that has multiple charges and you will find different sentences for each charge). Whether this holds for all CASE_IDs is not known, but it is likely, given what I have read about how sentencing works.
Drop all convictions that involve cannabis because the sentencing has likely changed over the years.
Something to look at would be whether the number of charges in a single case affects the sentence. To do this, replace the CHARGE_COUNT for each CASE_ID with the highest count, or count the number of charge rows per CASE_ID. Another thing to try would be to keep only the charge which has PRIMARY_CHARGE=True and drop all of the other rows for that CASE_ID. A sketch of both ideas follows below.
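Both are hypothetical helpers (untested; the column names are the ones in this dataset):
def add_case_charge_count(df):
    # Hypothetical: count the charge rows per CASE_ID and attach the count as a new column
    counts = df.groupby("CASE_ID")["CHARGE_ID"].transform("count")
    return df.assign(case_charge_count=counts)

def keep_primary_charges_only(df):
    # Hypothetical alternative: keep only the row(s) flagged as the primary charge in each case
    return df[df["PRIMARY_CHARGE"] == True]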
Column descriptions: https://datacatalog.cookcountyil.gov/Courts/Sentencing/tg8v-tm6u
For life sentence / boot camp: sentence_period_years is always NaN (see the cleaning rules above).
See which offense type has the lowest R^2 value - this might be an indicator of crimes which may be the most inconsistently judged
Run several versions of the algorithm
Try using XGBoost
len(sentencing_orig)