import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import missingno
import seaborn
plt.style.use('ggplot')
pd.options.display.max_columns = None # to force pandas to display all columns
pd.options.display.max_rows = None # to force pandas to display all rows.
# Warning: with max_rows = None, always remember to use .head() when looking at the complete dataset to prevent displaying all rows
pd.options.display.max_colwidth = 100 # to limit max column width. (To remove the limit, set to None.)
Note: This notebook uses the Sentencing dataset from the Cook County Government.
direct_download_link = "https://datacatalog.cookcountyil.gov/api/views/tg8v-tm6u/rows.csv?accessType=DOWNLOAD"
csv_filepath = "Sentencing.csv"
try:
    sentencing_orig = pd.read_csv(csv_filepath,
                                  parse_dates=["DISPOSITION_DATE", "SENTENCE_DATE",
                                               "INCIDENT_BEGIN_DATE", "INCIDENT_END_DATE",
                                               "ARREST_DATE", "ARRAIGNMENT_DATE", "RECEIVED_DATE"],
                                  index_col=0)
    print("Loaded dataset from CSV.")
except FileNotFoundError:
    print("Could not find data. Downloading a ~100MB file. This will take a few minutes.")
    sentencing_orig = pd.read_csv(direct_download_link,
                                  parse_dates=["DISPOSITION_DATE", "SENTENCE_DATE",
                                               "INCIDENT_BEGIN_DATE", "INCIDENT_END_DATE",
                                               "ARREST_DATE", "ARRAIGNMENT_DATE", "RECEIVED_DATE"])
    sentencing_orig.to_csv(csv_filepath, index=True)
    print(f"Downloaded CSV and saved to disk as {csv_filepath}")
sentencing_orig.shape
sentencing_orig.columns
# Preview
sentencing_orig.head()
# categorize features for preprocessing
# features that don't affect the model
drop_features = ["CASE_ID", "CASE_PARTICIPANT_ID", "CHARGE_ID", "CHARGE_VERSION_ID"]
numeric_features = ["LENGTH_OF_CASE_in_Days","AGE_AT_INCIDENT"]
# features to be one-hot encoded
categorical_features = ["OFFENSE_CATEGORY", "DISPOSITION_CHARGED_OFFENSE_TITLE", "CHARGE_DISPOSITION",
"SENTENCE_PHASE", "SENTENCE_JUDGE", "SENTENCE_TYPE", "COMMITMENT_TYPE",
"COMMITMENT_TYPE", "GENDER", "RACE", "LAW_ENFORCEMENT_AGENCY",
"UPDATED_OFFENSE_CATEGORY", "CHARGE_COUNT"] #use one-hot encoding with drop first
# these will also be one-hot encoded
boolean_features = ["CURRENT_SENTENCE", "PRIMARY_CHARGE"]
# ordinal encoding
ordinal_features = []
# to be dropped for now
features_for_later = ["COURT_NAME", "COURT_FACILITY", "INCIDENT_CITY"]
complex_or_unknown_features = ["DISPOSITION_CHARGED_CHAPTER", "DISPOSITION_CHARGED_ACT", "DISPOSITION_CHARGED_SECTION",
"DISPOSITION_CHARGED_CLASS", "DISPOSITION_CHARGED_AOIC", "UNIT"]
# to be dropped because not enough examples have feature
too_many_missing_features = ["CHARGE_DISPOSITION_REASON"]
# time features (also will be dropped for now)
time_features = [b"DISPOSITION_DATE", "SENTENCE_DATE", "INCIDENT_BEGIN_DATE", "INCIDENT_END_DATE", "ARREST_DATE",
"RECEIVED_DATE", "ARRAIGNMENT_DATE"]
# what we are predicting (y)
target = ["COMMITMENT_TERM", "COMMITMENT_UNIT"]
sentencing_orig.dtypes
A life sentence can appear in one of two columns (with the other column being NA). Below, I determine which columns (COMMITMENT_TYPE/COMMITMENT_UNIT) contain this information and pull it out into a new column called 'categorical_sentence'.
# Trying to figure out which columns include 'Life Sentence'
life_anywhere = sentencing_orig.eq('Natural Life').any(axis=1)
life_SENTENCE_PHASE = sentencing_orig["SENTENCE_PHASE"].str.contains("life", case=False, na=False) # Total 0
life_COMMITMENT_UNIT = sentencing_orig["COMMITMENT_UNIT"].str.contains("life", case=False, na=False) # Total 712
life_COMMITMENT_TERM = sentencing_orig["COMMITMENT_TERM"].str.contains("life", case=False, na=False) # Total 0
life_COMMITMENT_TYPE = sentencing_orig["COMMITMENT_TYPE"].str.contains("life", case=False, na=False) # Total 20
# CURRENT_SENTENCE is skipped: it is a boolean column (so .str fails) and is unlikely to contain a life sentence.
life_SENTENCE_TYPE = sentencing_orig["SENTENCE_TYPE"].str.contains("life", case=False, na=False) # Total 0
life_COMMITMENT_UNIT_and_COMMITMENT_TYPE = life_COMMITMENT_UNIT & life_COMMITMENT_TYPE # Total 1
life_COMMITMENT_UNIT_or_COMMITMENT_TYPE = life_COMMITMENT_UNIT | life_COMMITMENT_TYPE # Total 731
print(f"Number of rows with the words 'Natural Life' anywhere: {life_anywhere.sum()}")
print(f"Number of rows with the word life in SENTENCE_PHASE: {life_SENTENCE_PHASE.sum()}")
print(f"Number of rows with the word life in COMMITMENT_UNIT: {life_COMMITMENT_UNIT.sum()}")
print(f"Number of rows with the word life in COMMITMENT_TERM: {life_COMMITMENT_TERM.sum()}")
print(f"Number of rows with the word life in COMMITMENT_TYPE: {life_COMMITMENT_TYPE.sum()}")
print(f"Number of rows with the word life in SENTENCE_TYPE: {life_SENTENCE_TYPE.sum()}")
print(f"Number of rows with the word life in COMMITMENT_UNIT and COMMITMENT_TYPE: {life_COMMITMENT_UNIT_and_COMMITMENT_TYPE.sum()}")
print(f"Number of rows with the word life in COMMITMENT_UNIT or COMMITMENT_TYPE: {life_COMMITMENT_UNIT_or_COMMITMENT_TYPE.sum()}")
# Conclusion: Life sentences appear either in COMMITMENT_UNIT or in COMMITMENT_TYPE columns
def processLifeSentences(df):
    """
    Input: Cook County dataset.
    Output: Cook County dataset with an additional column called 'categorical_sentence',
            which is "Life" for a life sentence and "Other" for everything else.
    """
    life_COMMITMENT_UNIT = df["COMMITMENT_UNIT"].str.contains("life", case=False, na=False)
    life_COMMITMENT_TYPE = df["COMMITMENT_TYPE"].str.contains("life", case=False, na=False)
    life_or_not = life_COMMITMENT_UNIT | life_COMMITMENT_TYPE
    life_or_not = life_or_not.replace({True: "Life", False: "Other"})
    df.loc[:, "categorical_sentence"] = life_or_not
    return df
select_columns = ["DISPOSITION_CHARGED_OFFENSE_TITLE", "UPDATED_OFFENSE_CATEGORY", "COMMITMENT_TYPE", "COMMITMENT_TERM", "COMMITMENT_UNIT", "categorical_sentence"]
processed_life_sentences = processLifeSentences(sentencing_orig)
# Verify that 731 rows have Life sentence in the 'categorical_sentence' column
(processed_life_sentences["categorical_sentence"]=="Life").sum()
# Verify that no rows have been dropped
len(sentencing_orig) - len(processed_life_sentences)
# Preview a sample of the dataframe. Only selected columns displayed. Note the new column 'categorical_sentence'
processed_life_sentences.head(15)[select_columns]
print(processed_life_sentences["SENTENCE_TYPE"].str.contains("Death|death|DEATH", na=False).sum())
print(processed_life_sentences["COMMITMENT_TYPE"].str.contains("Death|death|DEATH", na=False).sum())
def dropDeathSentence(df):
    """
    Input: Cook County dataset after it has been processed by processLifeSentences and the 'categorical_sentence' column has been added.
    Output: Input dataset minus any rows involving a death sentence, because most of these are clearly errors (see notes).
    """
    # death_in_SENTENCE_TYPE = df.query("SENTENCE_TYPE == 'Death'") # 63 examples
    death_in_SENTENCE_TYPE = df["SENTENCE_TYPE"].str.contains("death", case=False, na=False)
    # death_in_COMMITMENT_TYPE = df.query("COMMITMENT_TYPE == 'Death'") # 9 examples
    death_in_COMMITMENT_TYPE = df["COMMITMENT_TYPE"].str.contains("death", case=False, na=False)
    incorrect_death_sentences = death_in_SENTENCE_TYPE | death_in_COMMITMENT_TYPE
    # Life sentences should not be dropped even if they also appear as a death sentence. See notes.
    life_sentences_mask = df["categorical_sentence"] == "Life"
    return df[~incorrect_death_sentences | life_sentences_mask]
no_death_sentences = dropDeathSentence(processed_life_sentences)
# Number of rows removed
len(sentencing_orig) - len(no_death_sentences)
# Verify no rows with death sentences remain except where it is also a Life sentence
print(((no_death_sentences["SENTENCE_TYPE"].str.contains("Death|death|DEATH", na=False)) & (no_death_sentences["categorical_sentence"]!="Life")).sum())
print(((no_death_sentences["COMMITMENT_TYPE"].str.contains("Death|death|DEATH", na=False)) & (no_death_sentences["categorical_sentence"]!="Life")).sum())
Examples with any other COMMITMENT_TYPE should be dropped (the reasons are in the notes at the end). Keep only the COMMITMENT_TYPE values listed in the function below, renaming ones that are grouped to the group name:
def processCOMMITMENT_TYPE(df):
    """
    Input: Cook County dataset which includes the "categorical_sentence" column added by processLifeSentences.
    Output: Keeps only rows with select COMMITMENT_TYPE values (see notes for reasons) and drops the rest.
            Fills the categorical_sentence column with the same value as COMMITMENT_TYPE
            (except for a few categories which are grouped together; see notes).
    """
    commitment_types_to_keep = ["Illinois Department of Corrections",
                                "Cook County Department of Corrections",
                                "Cook County Boot Camp",
                                "Cook County Impact Incarceration Program",
                                "Probation",
                                "710/410 Probation",
                                "Intensive Probation Services",
                                "Conditional Discharge",
                                "Court Supervision",
                                "Intensive Drug Probation Services",
                                "Drug Court Probation",
                                "Gang Probation",
                                "Sex Offender Probation",
                                "Drug School",
                                "Juvenile IDOC"]
    commitment_types_mask = df["COMMITMENT_TYPE"].isin(commitment_types_to_keep)
    life_sentences_mask = df["categorical_sentence"] == "Life"  # Boolean mask
    # Keep only rows with a selected COMMITMENT_TYPE or a life sentence
    df = df.loc[commitment_types_mask | life_sentences_mask]
    life_sentences_mask = df["categorical_sentence"] == "Life"  # Recompute the mask after filtering
    # COMMITMENT_TYPE is inconsistent for life sentences; it sometimes shows 'Prison' for a life sentence.
    # For my purposes, a life sentence is different from a regular prison sentence, so wherever the row is a
    # life sentence, overwrite the copied value with 'Life' before writing it into 'categorical_sentence'.
    updated_categorical_sentence = df["COMMITMENT_TYPE"].copy()  # force a copy, which is modified and then written over categorical_sentence
    updated_categorical_sentence.loc[life_sentences_mask] = "Life"
    df.loc[:, "categorical_sentence"] = updated_categorical_sentence  # update categorical_sentence with the new categories
    # Some categories have multiple names. Rename each to a common one.
    df = df.replace({'categorical_sentence':
                     {"Illinois Department of Corrections": "Prison",
                      "Cook County Department of Corrections": "Prison",
                      "Cook County Boot Camp": "Boot Camp",
                      "Cook County Impact Incarceration Program": "Boot Camp",
                      "710/410 Probation": "Probation"}})
    return df
processed_commitment_type = processCOMMITMENT_TYPE(no_death_sentences)
# Preview after adding other categories to 'categorical_sentence'
processed_commitment_type.head(15)[select_columns]
# Note that any rows which have a COMMITMENT_TYPE not in the selected list are dropped
len(no_death_sentences) - len(processed_commitment_type)
Note: I inspected and verified that the only categories in 'categorical_sentence' which lack COMMITMENT_TERM or COMMITMENT_UNIT are "Life" and "Boot Camp", which should not have a time duration. There are some instances of Life and Boot Camp having a time duration, but these are ignored because a duration does not make sense for them.
Cleaning rules:
For 'Life' and 'Boot Camp', the COMMITMENT_TERM and COMMITMENT_UNIT columns are ignored if they are not NaN. The sentence_period_years column will always be NaN for these examples.
The rules below apply to all categories except 'Life' and 'Boot Camp'.
# Split dataset into two parts: one with only 'Life'/'Boot Camp' and the other with the remaining categorical_sentence
no_duration_set = processed_commitment_type.query("categorical_sentence == 'Life' | categorical_sentence =='Boot Camp'")
duration_set = processed_commitment_type.query("categorical_sentence != 'Life' & categorical_sentence !='Boot Camp'")
# Verify that the two sets together are not missing any rows
len(duration_set)+len(no_duration_set)-len(processed_commitment_type)
# Confirm that there are no examples with missing COMMITMENT_TERM or COMMITMENT_UNIT
categories = duration_set["categorical_sentence"].unique()
for category in categories:
    rows_for_category = duration_set.query("categorical_sentence == @category")
    rows_missing_duration = rows_for_category[rows_for_category['COMMITMENT_TERM'].isna() | rows_for_category['COMMITMENT_UNIT'].isna()]
    print(f"{category}: {len(rows_missing_duration)}")
# filter only COMMITMENT_UNIT with ['Year(s)', 'Months', 'Weeks', 'Days']
proper_units_mask = duration_set['COMMITMENT_UNIT'].isin(['Year(s)', 'Months', 'Weeks', 'Days'])
duration_set_filtered_unit = duration_set[proper_units_mask]
print(len(duration_set) - len(duration_set_filtered_unit))
# filter only rows with numeric COMMITMENT_TERM
num_temp = pd.to_numeric(duration_set_filtered_unit["COMMITMENT_TERM"], errors="coerce")
duration_set_filtered_unit_term = duration_set_filtered_unit.copy()
duration_set_filtered_unit_term.loc[:, "COMMITMENT_TERM"] = num_temp
duration_set_filtered_unit_term = duration_set_filtered_unit_term.dropna(subset=["COMMITMENT_TERM"])
print(len(duration_set_filtered_unit) - len(duration_set_filtered_unit_term))
# Verify COMMITMENT_UNIT includes only those that were filtered
duration_set_filtered_unit_term["COMMITMENT_UNIT"].unique()
(no_duration_set['categorical_sentence']=="Life").sum()
# Convert all units to years
all_years = duration_set_filtered_unit_term.copy()
def convert_to_years(df, unit, multiplier):
    """
    Input: Cook County dataset which includes the "categorical_sentence" column added by processLifeSentences.
    Output: Adds a column called 'sentence_period_years' to the dataframe.
            For rows whose COMMITMENT_UNIT matches 'unit', fills sentence_period_years with COMMITMENT_TERM * multiplier.
            Leaves all other rows untouched.
    """
    if "sentence_period_years" not in df.columns:
        df["sentence_period_years"] = np.nan  # add the column, to be filled with the sentence period in years
    multiplied_units = df["COMMITMENT_TERM"] * multiplier  # computed for all rows, including ones not matching 'unit'
    unit_mask = df["COMMITMENT_UNIT"] == unit
    df.loc[unit_mask, "sentence_period_years"] = multiplied_units.loc[unit_mask]
    return df
if "sentence_period_years" not in select_columns:
select_columns.append("sentence_period_years")
all_years["COMMITMENT_UNIT"].unique()
all_years = convert_to_years(all_years, "Year(s)", 1)
all_years = convert_to_years(all_years, "Months", 1/12)
all_years = convert_to_years(all_years, "Weeks", 1/52)
all_years = convert_to_years(all_years, "Days", 1/365.25)
# Verify that no rows are dropped
len(duration_set_filtered_unit_term) - len(all_years)
# Preview after all processing is done
all_years.tail(10)[select_columns]
# Merge no_duration_set (which contains rows with Life/Boot Camp) with all_years (which is the processed duration_set)
# First add 'sentence_period_years' column to no_duration_set and set all rows to NaN
no_duration_set.loc[:, 'sentence_period_years'] = np.nan
# merge no_duration_set and all_years (DataFrame.append is deprecated; use pd.concat)
sentencing_processed = pd.concat([no_duration_set, all_years])
# Verify that all rows were added
(len(sentencing_processed)-len(all_years)) - len(no_duration_set)
# Processing statistics
len_o = len(sentencing_orig)
len_p = len(sentencing_processed)
len_dropped = len_o - len_p
print(f"{len_dropped} rows from the original dataset were dropped")
print(f"This amounts to {100*len_dropped/len_o:.2f}% of the original dataset")
# Random check for errors in data processing (Take random rows from the processed dataset and compare the original columns to the original dataset)
# Reload sentencing_orig to be sure we are comparing to the original data
sentencing_orig = pd.read_csv(csv_filepath,
parse_dates=["DISPOSITION_DATE", "SENTENCE_DATE",
"INCIDENT_BEGIN_DATE", "INCIDENT_END_DATE",
"ARREST_DATE", "ARRAIGNMENT_DATE", "RECEIVED_DATE"],
index_col=0)
for i in range(1000):
    random_position = np.random.randint(0, len(sentencing_processed))  # random positional row in the processed data
    index_number_of_row = sentencing_processed.index[random_position]
    original_row = sentencing_orig.loc[index_number_of_row]  # note we are indexing by ID now
    processed_row = sentencing_processed.loc[index_number_of_row]
    # Stack the two rows; dropna(axis=1) drops 'categorical_sentence' (present only in processed_row)
    # and any column where either row is NaN.
    compare_df = pd.concat([pd.DataFrame(original_row).T, pd.DataFrame(processed_row).T], sort=True).dropna(axis=1)
    if "COMMITMENT_TERM" in compare_df.columns:  # the column is dropped above when either row has it as NaN
        compare_df.loc[:, "COMMITMENT_TERM"] = pd.to_numeric(compare_df.loc[:, "COMMITMENT_TERM"])
    try:
        assert np.all(compare_df.iloc[0] == compare_df.iloc[1]), "error - compare not equal"
    except AssertionError:
        print(compare_df.iloc[0] == compare_df.iloc[1])
        print(f"i = {i}")
        raise  # stop the loop so the differing values can be inspected
print("Success! All tested rows are equal to the original.")
sentencing_processed["RACE"].value_counts()
| RACE | Count |
|---|---|
| Black | 153570 |
| White [Hispanic or Latino] | 35148 |
| White | 32456 |
| HISPANIC | 5744 |
| Asian | 1291 |
| White/Black [Hispanic or Latino] | 1123 |
| Unknown | 324 |
| American Indian | 121 |
| ASIAN | 65 |
| Biracial | 33 |
# Replace race names
sentencing_processed = sentencing_processed.replace({'RACE':
                                                     {"White [Hispanic or Latino]": "Hispanic",
                                                      "HISPANIC": "Hispanic",
                                                      "White/Black [Hispanic or Latino]": "Hispanic",
                                                      "ASIAN": "Asian"}})
# Fill race = NaN with 'Unknown'
missing_race = sentencing_processed["RACE"].isna()
sentencing_processed.loc[missing_race,"RACE"] = "Unknown"
# Limit max sentence_period_years to 100 years. About 30 rows affected
limit_years_mask = sentencing_processed["sentence_period_years"] > 100
sentencing_processed.loc[limit_years_mask, "sentence_period_years"] = 100
# sentencing_processed.sort_values(by=["sentence_period_years"], ascending=False)[["sentence_period_years", "UPDATED_OFFENSE_CATEGORY", "categorical_sentence"]].head(50)
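The same cap can be written with clip (an equivalent one-liner; idempotent, so it is a no-op after the cap above):
sentencing_processed["sentence_period_years"] = sentencing_processed["sentence_period_years"].clip(upper=100)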
# Replace age>100 with median age
median_age = np.median(sentencing_processed["AGE_AT_INCIDENT"].dropna())
limit_age_mask = sentencing_processed["AGE_AT_INCIDENT"] > 100
sentencing_processed.loc[limit_age_mask, "AGE_AT_INCIDENT"] = median_age
np.max(sentencing_processed["AGE_AT_INCIDENT"]) # should be <= 100 (values above 100 were replaced by the median)
# Save processed data to disk
sentencing_processed.to_csv("Sentencing_processed_data.csv") # when reading, remember to set index_col=0
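For a later session, a matching reload snippet (a sketch; the date columns must be re-parsed on read):
# sentencing_processed = pd.read_csv("Sentencing_processed_data.csv",
#                                    parse_dates=["DISPOSITION_DATE", "SENTENCE_DATE",
#                                                 "INCIDENT_BEGIN_DATE", "INCIDENT_END_DATE",
#                                                 "ARREST_DATE", "ARRAIGNMENT_DATE", "RECEIVED_DATE"],
#                                    index_col=0)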
Haven't cleaned this up, but there is some useful information here which should not be deleted.
A sentence can be one of the COMMITMENT_TYPE categories kept in processCOMMITMENT_TYPE (Prison, Boot Camp, Probation and its variants, Conditional Discharge, Court Supervision, Drug School, Juvenile IDOC) or a life sentence.
Reason why 'Death' is ignored: Death can appear under SENTENCE_TYPE (63 examples) or under COMMITMENT_TYPE (9 examples). Since almost all of these examples (both types) include a time period, and since the death penalty looks unlikely in many of these cases, I ignore the death penalty and just take the time period in (COMMITMENT_TERM, COMMITMENT_UNIT).
How to decide whether a sentence is 'Natural Life': it appears in either COMMITMENT_UNIT or COMMITMENT_TYPE (see the analysis above).
Assumption: For crimes that involve multiple charges, each conviction is sentenced independently of the others: no sentence is influenced by another, and the maximum sentence for a single charge is not applied to each of the individual charges. That sentences do differ across charges within a case is easy to confirm by inspection (pick a CASE_ID that has multiple charges and you will find different sentences for each charge). Whether this holds for all CASE_IDs is not known, but it is likely, given what I have read about how sentencing works.
Drop all convictions that involve cannabis because the sentencing has likely changed over the years.
Something to look at would be whether the number of charges in a single case affects the sentence. To do this, replace the CHARGE_COUNT for each CASE_ID with the highest count, or count the number of charge rows per CASE_ID. Another thing to try would be to keep only the charge which has PRIMARY_CHARGE=True and drop all of the other rows for that CASE_ID. A sketch of both ideas follows below.
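Both are hypothetical helpers (untested; the column names are the ones in this dataset):
def add_case_charge_count(df):
    # Hypothetical: count the charge rows per CASE_ID and attach the count as a new column
    counts = df.groupby("CASE_ID")["CHARGE_ID"].transform("count")
    return df.assign(case_charge_count=counts)

def keep_primary_charges_only(df):
    # Hypothetical alternative: keep only the row(s) flagged as the primary charge in each case
    return df[df["PRIMARY_CHARGE"] == True]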
Column descriptions: https://datacatalog.cookcountyil.gov/Courts/Sentencing/tg8v-tm6u
For life sentence / boot camp: sentence_period_years is always NaN (see the cleaning rules above).
See which offense type has the lowest R^2 value - this might be an indicator of crimes which may be the most inconsistently judged
Run several versions of the algorithm
Try using XGBoost
len(sentencing_orig)