In [124]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
In [125]:
import missingno
import seaborn
In [126]:
plt.style.use('ggplot')
In [127]:
pd.options.display.max_columns = None  # to force pandas to display all columns
In [128]:
pd.options.display.max_rows = None  # to force pandas to display all rows.
# Warning: always remember to use .head() when looking at complete dataset to prevent displaying all rows
In [129]:
pd.options.display.max_colwidth = 100  # to limit max column width. (To remove limit Set to 0.)

Note: This notebook uses the Sentencing dataset from the Cook County Government.

Load Dataset

In [130]:
direct_download_link = "https://datacatalog.cookcountyil.gov/api/views/tg8v-tm6u/rows.csv?accessType=DOWNLOAD"
csv_filepath = "Sentencing.csv"
try:
    sentencing_orig = pd.read_csv(csv_filepath,
                                  parse_dates=["DISPOSITION_DATE", "SENTENCE_DATE",
                                                 "INCIDENT_BEGIN_DATE", "INCIDENT_END_DATE",
                                                 "ARREST_DATE", "ARRAIGNMENT_DATE", "RECEIVED_DATE"],
                                  index_col=0)
    print("Loaded dataset from CSV.")
except:
    print("Could not find data. Downloading a ~100MB file. This will take a few minutes.")
    sentencing_orig = pd.read_csv(direct_download_link,
                                  parse_dates=["DISPOSITION_DATE", "SENTENCE_DATE",
                                                 "INCIDENT_BEGIN_DATE", "INCIDENT_END_DATE",
                                                 "ARREST_DATE", "ARRAIGNMENT_DATE", "RECEIVED_DATE"])
    sentencing_orig.to_csv(csv_filepath, index=True)
    print(f"Downloaded CSV and saved to disk as {csv_filepath}")
C:\Users\minur\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3051: DtypeWarning: Columns (8,9,22) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Loaded dataset from CSV.
In [131]:
# np.all(sentencing_orig_1.fillna("unknown") == sentencing_orig.fillna("unknown"))
In [132]:
# sentencing_orig = pd.read_csv("Sentencing.csv",
#                               parse_dates=["DISPOSITION_DATE", "SENTENCE_DATE", "INCIDENT_BEGIN_DATE", "INCIDENT_END_DATE", "ARREST_DATE", "ARRAIGNMENT_DATE", "RECEIVED_DATE"])
In [133]:
sentencing_orig.shape
Out[133]:
(236124, 39)
In [134]:
sentencing_orig.columns
Out[134]:
Index(['CASE_ID', 'CASE_PARTICIPANT_ID', 'OFFENSE_CATEGORY', 'PRIMARY_CHARGE',
       'CHARGE_ID', 'CHARGE_VERSION_ID', 'DISPOSITION_CHARGED_OFFENSE_TITLE',
       'DISPOSITION_CHARGED_CHAPTER', 'DISPOSITION_CHARGED_ACT',
       'DISPOSITION_CHARGED_SECTION', 'DISPOSITION_CHARGED_CLASS',
       'DISPOSITION_CHARGED_AOIC', 'DISPOSITION_DATE', 'CHARGE_DISPOSITION',
       'CHARGE_DISPOSITION_REASON', 'SENTENCE_PHASE', 'SENTENCE_DATE',
       'SENTENCE_JUDGE', 'SENTENCE_TYPE', 'CURRENT_SENTENCE',
       'COMMITMENT_TYPE', 'COMMITMENT_TERM', 'COMMITMENT_UNIT', 'COURT_NAME',
       'COURT_FACILITY', 'LENGTH_OF_CASE_in_Days', 'AGE_AT_INCIDENT', 'GENDER',
       'RACE', 'INCIDENT_BEGIN_DATE', 'INCIDENT_END_DATE', 'ARREST_DATE',
       'LAW_ENFORCEMENT_AGENCY', 'UNIT', 'INCIDENT_CITY', 'RECEIVED_DATE',
       'ARRAIGNMENT_DATE', 'UPDATED_OFFENSE_CATEGORY', 'CHARGE_COUNT'],
      dtype='object')
In [135]:
# Preview
sentencing_orig.head()
Out[135]:
CASE_ID CASE_PARTICIPANT_ID OFFENSE_CATEGORY PRIMARY_CHARGE CHARGE_ID CHARGE_VERSION_ID DISPOSITION_CHARGED_OFFENSE_TITLE DISPOSITION_CHARGED_CHAPTER DISPOSITION_CHARGED_ACT DISPOSITION_CHARGED_SECTION DISPOSITION_CHARGED_CLASS DISPOSITION_CHARGED_AOIC DISPOSITION_DATE CHARGE_DISPOSITION CHARGE_DISPOSITION_REASON SENTENCE_PHASE SENTENCE_DATE SENTENCE_JUDGE SENTENCE_TYPE CURRENT_SENTENCE COMMITMENT_TYPE COMMITMENT_TERM COMMITMENT_UNIT COURT_NAME COURT_FACILITY LENGTH_OF_CASE_in_Days AGE_AT_INCIDENT GENDER RACE INCIDENT_BEGIN_DATE INCIDENT_END_DATE ARREST_DATE LAW_ENFORCEMENT_AGENCY UNIT INCIDENT_CITY RECEIVED_DATE ARRAIGNMENT_DATE UPDATED_OFFENSE_CATEGORY CHARGE_COUNT
0 44670309710 218297158761 PROMIS Conversion False 297139645442 83571817251 FIRST DEGREE MURDER 38 - 9-1(a)(2) X 0000001607 12/17/2014 12:00:00 AM Nolle On Remand NaN Original Sentencing 6/2/1986 12:00:00 AM John Mannion Conversion True Natural Life NaN NaN District 6 - Markham Markham Courthouse 619.0 27.0 Male Black 1984-08-09 NaT 1984-08-15 CHICAGO POLICE DEPT NaN NaN 1984-08-15 9/21/1984 12:00:00 AM Homicide 2
1 44670309710 218297158761 PROMIS Conversion False 297176911341 94830742153 FIRST DEGREE MURDER 38 - 9-1(a)(3) X 0000001608 12/17/2014 12:00:00 AM Nolle On Remand NaN Original Sentencing 6/2/1986 12:00:00 AM John Mannion Conversion True Natural Life NaN NaN District 6 - Markham Markham Courthouse 619.0 27.0 Male Black 1984-08-09 NaT 1984-08-15 CHICAGO POLICE DEPT NaN NaN 1984-08-15 9/21/1984 12:00:00 AM Homicide 4
2 44670309710 218297158761 PROMIS Conversion False 297177207102 94826043158 FIRST DEGREE MURDER 38 - 9-1(a)(3) X 0000001608 12/17/2014 12:00:00 AM Nolle On Remand NaN Original Sentencing 6/2/1986 12:00:00 AM John Mannion Conversion True Natural Life NaN NaN District 6 - Markham Markham Courthouse 619.0 27.0 Male Black 1984-08-09 NaT 1984-08-15 CHICAGO POLICE DEPT NaN NaN 1984-08-15 9/21/1984 12:00:00 AM Homicide 5
3 44670309710 218297158761 PROMIS Conversion False 297178094385 70461958445 HOME INVASION 38-12-11-A(1) NaN NaN X 0000001846 12/17/2014 12:00:00 AM Plea Of Guilty NaN Amended/Corrected Sentencing 10/16/2014 12:00:00 AM Clayton Jay Crane Prison True Illinois Department of Corrections 30 Year(s) District 6 - Markham Markham Courthouse 10982.0 27.0 Male Black 1984-08-09 NaT 1984-08-15 CHICAGO POLICE DEPT NaN NaN 1984-08-15 9/21/1984 12:00:00 AM Homicide 13
4 44670309710 218297158761 PROMIS Conversion False 297178094385 70461958445 HOME INVASION 38-12-11-A(1) NaN NaN X 0000001846 12/17/2014 12:00:00 AM Plea Of Guilty NaN Original Sentencing 6/2/1986 12:00:00 AM John Mannion Conversion False Illinois Department of Corrections 30 Year(s) District 6 - Markham Markham Courthouse 619.0 27.0 Male Black 1984-08-09 NaT 1984-08-15 CHICAGO POLICE DEPT NaN NaN 1984-08-15 9/21/1984 12:00:00 AM Homicide 13

Categorize features by numeric / ordinal / categorical

In [136]:
# categorize features for preprocessing

# features that don't affect the model
drop_features = ["CASE_ID", "CASE_PARTICIPANT_ID", "CHARGE_ID", "CHARGE_VERSION_ID"]

numeric_features = ["LENGTH_OF_CASE_in_Days","AGE_AT_INCIDENT"]

# features to be one-hot encoded
categorical_features = ["OFFENSE_CATEGORY", "DISPOSITION_CHARGED_OFFENSE_TITLE", "CHARGE_DISPOSITION",
                        "SENTENCE_PHASE", "SENTENCE_JUDGE", "SENTENCE_TYPE", "COMMITMENT_TYPE",
                       "COMMITMENT_TYPE", "GENDER", "RACE", "LAW_ENFORCEMENT_AGENCY",
                       "UPDATED_OFFENSE_CATEGORY", "CHARGE_COUNT"] #use one-hot encoding with drop first

# these will also be one-hot encoded
boolean_features = ["CURRENT_SENTENCE", "PRIMARY_CHARGE"]


# ordinal encoding
ordinal_features = []

# to be dropped for now
features_for_later = ["COURT_NAME", "COURT_FACILITY", "INCIDENT_CITY"]

complex_or_unknown_features = ["DISPOSITION_CHARGED_CHAPTER", "DISPOSITION_CHARGED_ACT", "DISPOSITION_CHARGED_SECTION",
                              "DISPOSITION_CHARGED_CLASS", "DISPOSITION_CHARGED_AOIC", "UNIT"]

# to be dropped because not enough examples have feature
too_many_missing_features = ["CHARGE_DISPOSITION_REASON"]

# time features (also will be dropped for now)
time_features = [b"DISPOSITION_DATE", "SENTENCE_DATE", "INCIDENT_BEGIN_DATE", "INCIDENT_END_DATE", "ARREST_DATE",
                "RECEIVED_DATE", "ARRAIGNMENT_DATE"]

# what we are predicting (y)
target = ["COMMITMENT_TERM", "COMMITMENT_UNIT"]
In [137]:
sentencing_orig.dtypes
Out[137]:
CASE_ID                                       int64
CASE_PARTICIPANT_ID                           int64
OFFENSE_CATEGORY                             object
PRIMARY_CHARGE                                 bool
CHARGE_ID                                     int64
CHARGE_VERSION_ID                             int64
DISPOSITION_CHARGED_OFFENSE_TITLE            object
DISPOSITION_CHARGED_CHAPTER                  object
DISPOSITION_CHARGED_ACT                      object
DISPOSITION_CHARGED_SECTION                  object
DISPOSITION_CHARGED_CLASS                    object
DISPOSITION_CHARGED_AOIC                     object
DISPOSITION_DATE                             object
CHARGE_DISPOSITION                           object
CHARGE_DISPOSITION_REASON                    object
SENTENCE_PHASE                               object
SENTENCE_DATE                                object
SENTENCE_JUDGE                               object
SENTENCE_TYPE                                object
CURRENT_SENTENCE                               bool
COMMITMENT_TYPE                              object
COMMITMENT_TERM                              object
COMMITMENT_UNIT                              object
COURT_NAME                                   object
COURT_FACILITY                               object
LENGTH_OF_CASE_in_Days                      float64
AGE_AT_INCIDENT                             float64
GENDER                                       object
RACE                                         object
INCIDENT_BEGIN_DATE                  datetime64[ns]
INCIDENT_END_DATE                    datetime64[ns]
ARREST_DATE                          datetime64[ns]
LAW_ENFORCEMENT_AGENCY                       object
UNIT                                         object
INCIDENT_CITY                                object
RECEIVED_DATE                        datetime64[ns]
ARRAIGNMENT_DATE                             object
UPDATED_OFFENSE_CATEGORY                     object
CHARGE_COUNT                                  int64
dtype: object

Drop unused features once processing categorical sentences is complete

  • Features to be dropped:
    • Features involving time
      • DISPOSITION_DATE, SENTENCE_DATE, INCIDENT_BEGIN_DATE, INCIDENT_END_DATE, ARREST_DATE, RECEIVED_DATE, ARRAIGNMENT_DATE
    • Unique identifiers:
      • CASE_ID, CASE_PARTICIPANT_ID, CHARGE_ID, CHARGE_VERSION_ID, DISPOSITION_CHARGED_CHAPTER
    • Features which seem to be useless (mostly ones that dont encode much information / has same value for all examples

      - OFFENSE_CATEGORY

    • Drop DISPOSITION_CHARGED_OFFENSE_TITLE and related features for two reasons
      • Has far too many unique values
        • UPDATED_OFFENSE_CATEGORY appears to be the final decision and where UPDATED_OFFENSE_CATEGORY differs significantly from DISPOSITION_CHARGED_OFFENSE_TITLE (such as change from First degree murder -> burglary), the sentence seems to match UPDATED_OFFENSE_CATEGORY.
      • Other related features include:
        • DISPOSITION_CHARGED_CHAPTER, DISPOSITION_CHARGED_ACT, DISPOSITION_CHARGED_SECTION, DISPOSITION_CHARGED_CLASS, DISPOSITION_CHARGED_AOIC, DISPOSITION_DATE, CHARGE_DISPOSITION_REASON
    • Other features that are not related to the crime(I want a model that tries to not be biased towards factors that shouldn't make a diffference in the sentence). ALSO TRY WITH THESE FEATURES AND SEE HOW CHANGING IT CHANGES THE PREDICTION.
      • SENTENCE_JUDGE, LENGTH_OF_CASE_in_Days, COMMITMENT_TYPE (to be dropped after findLifeSentences), COURT_NAME, COURT_FACILITY, LAW_ENFORCEMENT_AGENCY, UNIT (similar to agency, more specific)

Processing Categorical Sentences

Finding Life Sentences

Life Sentence can appear in one of two columns (with the other column being NA). Below, I figure out which columns (COMMITMENT_TYPE/COMMITMENT_UNIT) contain this information and pull this out into a new column called 'categorical_sentence'.

In [138]:
#Trying to figure out which columns include 'Life Sentence'

life_anywhere = sentencing_orig.eq('Natural Life').any(axis=1)
life_SENTENCE_PHASE = sentencing_orig["SENTENCE_PHASE"].str.contains("life|Life|LIFE", na=False)       # Total 0
life_COMMITMENT_UNIT = sentencing_orig["COMMITMENT_UNIT"].str.contains("life|Life|LIFE", na=False)     # Total 712
life_COMMITMENT_TERM = sentencing_orig["COMMITMENT_TERM"].str.contains("life|Life|LIFE", na=False)     # Total 0
life_COMMITMENT_TYPE = sentencing_orig["COMMITMENT_TYPE"].str.contains("life|Life|LIFE", na=False)     # Total 20
# life_CURRENT_SENTENCE = sentencing_orig["CURRENT_SENTENCE"].str.contains("life|Life", na=False) # has problems but unlikely to contain life sentence so ignore.
life_SENTENCE_TYPE = sentencing_orig["SENTENCE_TYPE"].str.contains("life|Life|LIFE", na=False)         # Total 0
life_COMMITMENT_UNIT_and_SENTENCE_TYPE = life_COMMITMENT_UNIT & life_COMMITMENT_TYPE                   # Total 1
life_COMMITMENT_UNIT_or_SENTENCE_TYPE = life_COMMITMENT_UNIT | life_COMMITMENT_TYPE                    # Total 731

print(f"Number of rows with the words Natural Life {life_anywhere.sum()}")
print(f"Number of rows with the word life in SENTENCE_PHASE {life_SENTENCE_PHASE.sum()}")
print(f"Number of rows with the word life in COMMITMENT_UNIT {life_COMMITMENT_UNIT.sum()}")
print(f"Number of rows with the word life in COMMITMENT_TERM {life_COMMITMENT_TERM.sum()}")
print(f"Number of rows with the word life in COMMITMENT_TYPE {life_COMMITMENT_TYPE.sum()}")
print(f"Number of rows with the word life in SENTENCE_TYPE {life_SENTENCE_TYPE.sum()}")
print(f"Number of rows with the word life in COMMITMENT_UNIT_and_SENTENCE_TYPE {life_COMMITMENT_UNIT_and_SENTENCE_TYPE.sum()}")
print(f"Number of rows with the word life in COMMITMENT_UNIT_or_SENTENCE_TYPE {life_COMMITMENT_UNIT_or_SENTENCE_TYPE.sum()}")

# Conclusion: Life sentences appear either in COMMITMENT_UNIT or in COMMITMENT_TYPE columns
C:\Users\minur\Anaconda3\lib\site-packages\pandas\core\ops\__init__.py:1115: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  result = method(y)
Number of rows with the words Natural Life 731
Number of rows with the word life in SENTENCE_PHASE 0
Number of rows with the word life in COMMITMENT_UNIT 712
Number of rows with the word life in COMMITMENT_TERM 0
Number of rows with the word life in COMMITMENT_TYPE 20
Number of rows with the word life in SENTENCE_TYPE 0
Number of rows with the word life in COMMITMENT_UNIT_and_SENTENCE_TYPE 1
Number of rows with the word life in COMMITMENT_UNIT_or_SENTENCE_TYPE 731
In [139]:
def processLifeSentences(df):
    """
    Input: Cook county dataset
    Output: Cook county dataset with an additional column called 'categorical_sentence' which is "Life" for a life sentence and "other" for everything else.
    """
    life_COMMITMENT_UNIT = df["COMMITMENT_UNIT"].str.contains("life|Life|LIFE", na=False)
    life_COMMITMENT_TYPE = df["COMMITMENT_TYPE"].str.contains("life|Life|LIFE", na=False)
    life_or_not = life_COMMITMENT_UNIT | life_COMMITMENT_TYPE
    life_or_not = life_or_not.replace(True, "Life")
    life_or_not = life_or_not.replace(False, "Other")
    df.loc[:,"categorical_sentence"] = life_or_not
    
    return df    
In [140]:
select_columns = ["DISPOSITION_CHARGED_OFFENSE_TITLE", "UPDATED_OFFENSE_CATEGORY", "COMMITMENT_TYPE", "COMMITMENT_TERM", "COMMITMENT_UNIT",  "categorical_sentence"]
In [141]:
processed_life_sentences = processLifeSentences(sentencing_orig)
In [142]:
# Verify that 731 rows have Life sentence in the 'categorical_sentence' column
(processed_life_sentences["categorical_sentence"]=="Life").sum()
Out[142]:
731
In [143]:
# Verify that no rows have been dropped
len(sentencing_orig) - len(processed_life_sentences)
Out[143]:
0
In [144]:
# Preview a sample of the dataframe. Only selected columns displayed. Note the new column 'categorical_sentence'
processed_life_sentences.head(15)[select_columns]
Out[144]:
DISPOSITION_CHARGED_OFFENSE_TITLE UPDATED_OFFENSE_CATEGORY COMMITMENT_TYPE COMMITMENT_TERM COMMITMENT_UNIT categorical_sentence
0 FIRST DEGREE MURDER Homicide Natural Life NaN NaN Life
1 FIRST DEGREE MURDER Homicide Natural Life NaN NaN Life
2 FIRST DEGREE MURDER Homicide Natural Life NaN NaN Life
3 HOME INVASION Homicide Illinois Department of Corrections 30 Year(s) Other
4 HOME INVASION Homicide Illinois Department of Corrections 30 Year(s) Other
5 HOME INVASION Homicide Illinois Department of Corrections 30 Year(s) Other
6 ARMED ROBBERY Homicide Illinois Department of Corrections 30 Year(s) Other
7 ARMED ROBBERY Homicide Illinois Department of Corrections 30 Year(s) Other
8 ARMED ROBBERY Homicide Illinois Department of Corrections 30 Year(s) Other
9 FIRST DEGREE MURDER Homicide Illinois Department of Corrections 62 Year(s) Other
10 FIRST DEGREE MURDER Homicide Natural Life NaN NaN Life
11 ATTEMPT ARMED ROBBERY=720-5\8-4(18-2(A)) Homicide Illinois Department of Corrections 15 Year(s) Other
12 ATTEMPT ARMED ROBBERY=720-5\8-4(18-2(A)) Homicide Illinois Department of Corrections 15 Year(s) Other
13 ATT(MURDER)=720-5\8-4(9-1(A)(1-3)) Homicide Illinois Department of Corrections 30 Year(s) Other
14 ATT(MURDER)=720-5\8-4(9-1(A)(1-3)) Homicide Illinois Department of Corrections 30 Year(s) Other

Drop any examples where the sentence is death.

  • Doing this primarily because this dataset clearly has errors with death sentences.
    • Run this query: sentencing_orig.query("SENTENCE_TYPE == 'Death'").groupby("DISPOSITION_CHARGED_OFFENSE_TITLE")["CASE_ID"].nunique()
    • Out of the 64 examples, only 5 involve homicide. The others are clearly incorrect examples are for Credit Card Cases' and and 'Retail Fraud'
    • Exception is where the row also has a "Natural Life" in the COMMITMENT_UNIT column. These rows are kept but classified as Life sentence - not death.
In [145]:
print(processed_life_sentences["SENTENCE_TYPE"].str.contains("Death|death|DEATH", na=False).sum())
print(processed_life_sentences["COMMITMENT_TYPE"].str.contains("Death|death|DEATH", na=False).sum())
63
9
In [146]:
def dropDeathSentence(df):
    """
    Input: Cook County Dataset after it has been processed by processLifeSentences and the 'categorical_sentence' column is added. 
    Output: Input dataset minus any rows involving death sentence because most of these are clearly errors (see notes).
    """

    #death_in_SENTENCE_TYPE = df.query("SENTENCE_TYPE == 'Death'")      # 63 examples
    death_in_SENTENCE_TYPE = df["SENTENCE_TYPE"].str.contains("Death|death|DEATH", na=False)
    
    #death_in_COMMITMENT_TYPE = df.query("COMMITMENT_TYPE == 'Death'")  # 9 examples
    death_in_COMMITMENT_TYPE = df["COMMITMENT_TYPE"].str.contains("Death|death|DEATH", na=False)

    incorrect_death_sentences = death_in_SENTENCE_TYPE | death_in_COMMITMENT_TYPE
    
    # Life sentences which should not be dropped even if it appears as a death sentence. See notes.
    life_sentences_mask = df["categorical_sentence"] == "Life"

    return df[~incorrect_death_sentences|life_sentences_mask]
In [147]:
no_death_sentences = dropDeathSentence(processed_life_sentences)
In [148]:
# Number of rows removed
len(sentencing_orig) - len(no_death_sentences)
Out[148]:
56
In [149]:
# Verify no rows with death sentences remain except where it is also a Life sentence
print(((no_death_sentences["SENTENCE_TYPE"].str.contains("Death|death|DEATH", na=False)) & (no_death_sentences["categorical_sentence"]!="Life")).sum())
print(((no_death_sentences["COMMITMENT_TYPE"].str.contains("Death|death|DEATH", na=False)) & (no_death_sentences["categorical_sentence"]!="Life")).sum())
0
0
In [ ]:
 

Figuring out other categorical sentence type

Examples with the following COMMITMENT_TYPE should be dropped because it is either:

  • Derived from information that is not available in this table
    • Mental Health Probation
    • Any NaN excluding for life sentence
    • Gang Probation
    • 2nd Chance Probation
    • Veteran's Court Probation
    • Inpatient Mental Health Services
    • Repeat Offender Probation
  • Too few examples (<50)
    • Conditional Release
    • Domestic Violence Probation
    • Repeat Offender Probation
    • Periodic Imprisonment
    • Home Confinement
    • Probation Terminated Unsatisfactorily
    • Probation Terminated Instanter
  • NaN (except for life sentence)

Keep only these COMMITMENT_TYPE (rename ones that are grouped to the group name):

  • "Prison"
    • Illinois Department of Corrections
    • Cook County Department of Corrections
  • "Boot Camp"
    • Cook County Boot Camp
    • Cook County Impact Incarceration Program
  • "Probation"
    • Probation
    • 710/410 Probation (Probation for drug offenders with no prior criminal record)
  • "Intensive Probation Services"
  • "Conditional Discharge"
  • "Court Supervision"
  • "Intensive Drug Probation Services"
  • "Drug Court Probation"
  • "Gang Probation"
  • "Sex Offender Probation"
  • "Drug School"
  • "Juvenile IDOC"
In [150]:
def processCOMMITMENT_TYPE(df):
    """
    Input: Cook County dataset which includes "categorical_sentence" column added by processLifeSentences.
    Output: Keeps only rows with select COMMITMENT_TYPE (see notes for reasons)  and drop rest.
            Fill the column categorical_sentence with the same value as in COMMITMENT_TYPE (except for a few which are grouped together; see notes) 
    """
    
    commitment_types_to_keep = ["Illinois Department of Corrections",
                                "Cook County Department of Corrections",
                                "Cook County Boot Camp",
                                "Cook County Impact Incarceration Program",
                                "Probation",
                                "710/410 Probation",
                                "Intensive Probation Services",
                                "Conditional Discharge",
                                "Court Supervision",
                                "Intensive Drug Probation Services",
                                "Drug Court Probation",
                                "Gang Probation",
                                "Sex Offender Probation",
                                "Drug School",
                                "Juvenile IDOC"]
    commitment_types_mask = df["COMMITMENT_TYPE"].isin(commitment_types_to_keep)
    life_sentences_mask = df["categorical_sentence"] == "Life"  # Boolean array
    
    # Filter only select COMMITMENT_TYPE or life sentences
    df = df.loc[commitment_types_mask | life_sentences_mask]
    
    life_sentences_mask = df["categorical_sentence"] == "Life"  # Update life sentences mask after removing select categories

    # COMMITMENT_TYPE is inconsistent for life sentence; sometimes shows COMMITMENT_TYPE as 'Prison' for life sentences.
    # For my purposes, a life sentence is different from a regular prison sentence. 
    # Whereever COMMITMENT_TYPE was 'Prison' for life sentences, overwrite that value as 'Life' before copying over to 'categorical_sentence' column.
    
#     updated_categorical_sentence = pd.Series(df["COMMITMENT_TYPE"])  # do pd.DataFrame to avoid settingwithcopywarning which should not be given since I want to modify the copy.
#     *********** UPDATED BELOW ***********
    updated_categorical_sentence = df["COMMITMENT_TYPE"].copy()  # force a copy which will be modified and pasted over categorical_sentence
    updated_categorical_sentence.loc[life_sentences_mask] = "Life"  # Overwrite any 
    df.loc[:,"categorical_sentence"] = updated_categorical_sentence  # update categorical_sentence with new categories
    
    # Some categories have multiple names. Rename to a common one.
    df = df.replace({'categorical_sentence':
                     {"Illinois Department of Corrections": "Prison",
                      "Cook County Department of Corrections": "Prison",
                      "Cook County Boot Camp": "Boot Camp",
                      "Cook County Impact Incarceration Program": "Boot Camp",
                      "710/410 Probation": "Probation"}})

    return df
In [151]:
processed_commitment_type = processCOMMITMENT_TYPE(no_death_sentences)
C:\Users\minur\Anaconda3\lib\site-packages\pandas\core\indexing.py:494: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
In [152]:
# Preview after adding other categories to 'categorical_sentence'
processed_commitment_type.head(15)[select_columns]
Out[152]:
DISPOSITION_CHARGED_OFFENSE_TITLE UPDATED_OFFENSE_CATEGORY COMMITMENT_TYPE COMMITMENT_TERM COMMITMENT_UNIT categorical_sentence
0 FIRST DEGREE MURDER Homicide Natural Life NaN NaN Life
1 FIRST DEGREE MURDER Homicide Natural Life NaN NaN Life
2 FIRST DEGREE MURDER Homicide Natural Life NaN NaN Life
3 HOME INVASION Homicide Illinois Department of Corrections 30 Year(s) Prison
4 HOME INVASION Homicide Illinois Department of Corrections 30 Year(s) Prison
5 HOME INVASION Homicide Illinois Department of Corrections 30 Year(s) Prison
6 ARMED ROBBERY Homicide Illinois Department of Corrections 30 Year(s) Prison
7 ARMED ROBBERY Homicide Illinois Department of Corrections 30 Year(s) Prison
8 ARMED ROBBERY Homicide Illinois Department of Corrections 30 Year(s) Prison
9 FIRST DEGREE MURDER Homicide Illinois Department of Corrections 62 Year(s) Prison
10 FIRST DEGREE MURDER Homicide Natural Life NaN NaN Life
11 ATTEMPT ARMED ROBBERY=720-5\8-4(18-2(A)) Homicide Illinois Department of Corrections 15 Year(s) Prison
12 ATTEMPT ARMED ROBBERY=720-5\8-4(18-2(A)) Homicide Illinois Department of Corrections 15 Year(s) Prison
13 ATT(MURDER)=720-5\8-4(9-1(A)(1-3)) Homicide Illinois Department of Corrections 30 Year(s) Prison
14 ATT(MURDER)=720-5\8-4(9-1(A)(1-3)) Homicide Illinois Department of Corrections 30 Year(s) Prison
In [153]:
# Note that any rows which have a COMMITMENT_TYPE not in the selected list are dropped
len(no_death_sentences) - len(processed_commitment_type)
Out[153]:
4729

Calculating sentence period in years

Note: Inspected and verified that the only categories in 'categorical_sentence' which do not have COMMITMENT_TERM or COMMITMENT_UNIT are "Life" and "Boot Camp" which should not have a time duration. There are some instances of Life and Boot Camp having a time duration, but these are ignored because they do not make sense here.

Cleaning rules:

  • For 'Life' and 'Boot Camp', the COMMITMENT_TERM and COMMITMENT_UNIT columns are ignored if they are not NaN. The sentence_period_years column will always be NaN for these examples.

  • The below rules apply only all categories except 'Life' and 'Boot Camp'

    • If the COMMITMENT_TERM fails coercion to number by pandas, remove row. There appear to be a few examples where there are errors in entry (such as including the month in COMMITMENT_TERM). These are very few cases, so dropping them is the simplest thing to do.
    • If the COMMITMENT_UNIT is not one of ['Year(s)', 'Months', 'Weeks', 'Days'], drop row.
    • If the sentence_period_years is greater than 300 years, discard row. (Doing this because I found two examples where the sentence was 2.0e6 for non-homicide.)
In [154]:
# Split dataset into two parts: one with only 'Life'/'Boot Camp' and the other with the remaining categorical_sentence
no_duration_set = processed_commitment_type.query("categorical_sentence == 'Life' | categorical_sentence =='Boot Camp'")
duration_set = processed_commitment_type.query("categorical_sentence != 'Life' & categorical_sentence !='Boot Camp'")
In [155]:
# Verify that together the two sets added together are not missing any rows 
len(duration_set)+len(no_duration_set)-len(processed_commitment_type)
Out[155]:
0
In [156]:
# Confirm that there are no examples with missing COMMITMENT_TERM or COMMITMENT_UNIT
categories = duration_set["categorical_sentence"].unique()
for category in categories:
    rows_for_category = duration_set.query("categorical_sentence == @category")
    rows_missing_duration = rows_for_category[duration_set['COMMITMENT_TERM'].isna() | duration_set['COMMITMENT_UNIT'].isna()]
    
    print(f"""{category}: {len(rows_missing_duration)}""")
C:\Users\minur\Anaconda3\lib\site-packages\ipykernel_launcher.py:5: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  """
Prison: 0
Probation: 0
Court Supervision: 0
Conditional Discharge: 0
Intensive Probation Services: 0
Drug Court Probation: 0
Intensive Drug Probation Services: 0
Gang Probation: 0
Sex Offender Probation: 0
Drug School: 0
Juvenile IDOC: 0
In [157]:
# filter only COMMITMENT_UNIT with ['Year(s)', 'Months', 'Weeks', 'Days']
proper_units_mask = duration_set['COMMITMENT_UNIT'].isin(['Year(s)', 'Months', 'Weeks', 'Days'])
duration_set_filtered_unit = duration_set[proper_units_mask]
print(len(duration_set) - len(duration_set_filtered_unit))
270
In [158]:
# filter only rows with numeric COMMITMENT_TERM
num_temp = pd.to_numeric(duration_set_filtered_unit["COMMITMENT_TERM"], errors="coerce")
duration_set_filtered_unit_term = pd.DataFrame(duration_set_filtered_unit)
duration_set_filtered_unit_term.loc[:,"COMMITMENT_TERM"] = num_temp
duration_set_filtered_unit_term = duration_set_filtered_unit_term.dropna(subset=["COMMITMENT_TERM"])
print(len(duration_set_filtered_unit) - len(duration_set_filtered_unit_term))
4
In [159]:
# Verify COMMITMENT_UNIT includes only those that were filtered
duration_set_filtered_unit_term["COMMITMENT_UNIT"].unique()
Out[159]:
array(['Year(s)', 'Days', 'Months', 'Weeks'], dtype=object)
In [160]:
(no_duration_set['categorical_sentence']=="Life").sum()
Out[160]:
731
In [161]:
# Convert all units to years

all_years = duration_set_filtered_unit_term.copy()

def convert_to_years(df, unit, multiplier):
    """
    input: Cook County dataset which includes "categorical_sentence" column added by processLifeSentences.
    output: Adds a column called 'sentence_period_years' to dataframe.
            For COMMITMENT_UNIT which matches 'unit', fill in sentence_period_years with the value in COMMITMENT_TERM * multiplier
            Leaves all other rows untouched.
    """

    if ("sentence_period_years" not in df.columns):
        all_years["sentence_period_years"] = np.NaN  # add column which will be updated with sentence period in years
        
    multiplied_units = df["COMMITMENT_TERM"].copy() * multiplier  # do for all rows including ones not matching 'unit'
    unit_mask = df["COMMITMENT_UNIT"] == unit

#     print(unit_mask.head())
    df.loc[unit_mask, "sentence_period_years"] = multiplied_units.loc[unit_mask]
    return df
    
In [162]:
if "sentence_period_years" not in select_columns:
    select_columns.append("sentence_period_years")
In [163]:
all_years["COMMITMENT_UNIT"].unique()
Out[163]:
array(['Year(s)', 'Days', 'Months', 'Weeks'], dtype=object)
In [164]:
all_years = convert_to_years(all_years, "Year(s)", 1)
all_years = convert_to_years(all_years, "Months", 1/12)
all_years = convert_to_years(all_years, "Weeks", 1/52)
all_years = convert_to_years(all_years, "Days", 1/365.25)
In [165]:
# Verify that no rows are dropped
len(duration_set_filtered_unit_term) - len(all_years)
Out[165]:
0
In [166]:
# Preview after all processing is done
all_years.tail(10)[select_columns]
Out[166]:
DISPOSITION_CHARGED_OFFENSE_TITLE UPDATED_OFFENSE_CATEGORY COMMITMENT_TYPE COMMITMENT_TERM COMMITMENT_UNIT categorical_sentence sentence_period_years
236111 CRIMINAL TRESPASS TO VEHICLES Possession of Stolen Motor Vehicle Cook County Department of Corrections 2.0 Days Prison 0.005476
236112 AGGRAVATED UNLAWFUL USE OF WEAPON UUW - Unlawful Use of Weapon Probation 2.0 Year(s) Probation 2.000000
236113 AGGRAVATED BATTERY Aggravated Battery Police Officer Probation 2.0 Year(s) Probation 2.000000
236114 AGG UUW/LOADED/NO FCCA/FOID UUW - Unlawful Use of Weapon Cook County Department of Corrections 1.0 Year(s) Prison 1.000000
236115 ATTEMPT POSSESSION OF A CONTROLLED SUBSTANCE Narcotics Cook County Department of Corrections 39.0 Days Prison 0.106776
236117 ESCAPE; FAILURE TO COMPLY WITH A CONDITION OF THE ELECTRONIC HOME MONITORING DETENTION PROGRAM Escape - Failure to Return Illinois Department of Corrections 2.0 Year(s) Prison 2.000000
236119 PUBLIC INDECENCY/EXPOSURE/3+ Other Offense Illinois Department of Corrections 2.0 Year(s) Prison 2.000000
236120 MFG/DEL CANNABIS/2.5-10 GRAMS Narcotics Illinois Department of Corrections 2.0 Days Prison 0.005476
236122 AGGRAVATED DRIVING UNDER THE INFLUENCE OF ALCOHOL Aggravated DUI Illinois Department of Corrections 1.0 Year(s) Prison 1.000000
236123 VIOLATION OF THE SEX OFFENDER REGISTRATION ACT Failure to Register as a Sex Offender Illinois Department of Corrections 3.0 Year(s) Prison 3.000000
In [167]:
# Merge no_duration_set (which contains rows with Life/Boot Camp) with all_years (which is the processed duration_set)

# First add 'sentence_period_years' column to no_duration_set and set all rows to NaN
no_duration_set.loc[:,'sentence_period_years'] = np.NaN

# merge no_duration_set and all_years
sentencing_processed = no_duration_set.append(all_years)
C:\Users\minur\Anaconda3\lib\site-packages\pandas\core\indexing.py:376: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
In [168]:
# Verify that all rows were added
(len(sentencing_processed)-len(all_years)) - len(no_duration_set)
Out[168]:
0

Finished Processing

In [169]:
# Processing statistics
len_o = len(sentencing_orig)
len_p = len(sentencing_processed)
len_dropped = len_o - len_p
print(f"{len_dropped} rows from the original dataset were dropped")
print(f"This amounts to {100*len_dropped/len_o:.2f}% of the original dataset")
5059 rows from the original dataset were dropped
This amounts to 2.14% of the original dataset
In [170]:
# Random check for errors in data processing (Take random rows from the processed dataset and compare the original columns to the original dataset)

# Reload sentencing_orig be sure we are comparing to the original data
sentencing_orig = pd.read_csv(csv_filepath,
                                  parse_dates=["DISPOSITION_DATE", "SENTENCE_DATE",
                                                 "INCIDENT_BEGIN_DATE", "INCIDENT_END_DATE",
                                                 "ARREST_DATE", "ARRAIGNMENT_DATE", "RECEIVED_DATE"],
                                  index_col=0)

for i in range(1000):
    random_row_number_in_processed_data = np.random.randint(1,len(sentencing_processed) + 1) # max = len(sentencing_processed) = [1,231066]
    index_number_of_row = sentencing_processed.iloc[random_row_number_in_processed_data-1:random_row_number_in_processed_data].index[0]
    original_row = sentencing_orig.loc[index_number_of_row] # note we are indexing by ID now
    processed_row = sentencing_processed.loc[index_number_of_row]

    # drop categorical sentence since it is only in processed_row.
    compare_df = pd.DataFrame(original_row).T.append(pd.DataFrame(processed_row).T, sort=True).dropna(axis=1) #.drop(['categorical_sentence'], axis=1)  <- need this if sentencing_orig has been modified to include categorical_sentence
    compare_df.loc[:,"COMMITMENT_TERM"] = pd.to_numeric(compare_df.loc[:,"COMMITMENT_TERM"])
    try:
        assert np.all(compare_df.iloc[0]==compare_df.iloc[1]), "error - compare not equal"
    except:
        print(compare_df.iloc[0]==compare_df.iloc[1])
        print(f"i = {i}")
        assert False, "Just to break the loop so that I can see which values are different"

print("Success! All tested rows are equal to the original.")
Success! All tested rows are equal to the original.
In [171]:
sentencing_processed["RACE"].value_counts()
Out[171]:
Black                               153570
White [Hispanic or Latino]           35148
White                                32456
HISPANIC                              5744
Asian                                 1291
White/Black [Hispanic or Latino]      1123
Unknown                                324
American Indian                        121
ASIAN                                   65
Biracial                                33
Name: RACE, dtype: int64

Fix incorrect race

RACE Count
Black 153570
White [Hispanic or Latino] 35148
White 32456
HISPANIC 5744
Asian 1291
White/Black [Hispanic or Latino] 1123
Unknown 324
American Indian 121
ASIAN 65
Biracial 33
  • Merge [HISPANIC, White [Hispanic or Latino], White/Black [Hispanic or Latino]]
  • Merge [ASIAN, Asian]
In [172]:
# Replace race names
sentencing_processed = sentencing_processed.replace({'RACE':
                                                     {"White [Hispanic or Latino]": "Hispanic",
                                                      "HISPANIC": "Hispanic",
                                                      "White/Black [Hispanic or Latino]": "Hispanic",
                                                      "ASIAN": "Asian",
                                                      "710/410 Probation": "Probation"}})

# Fill race = NaN with 'Unknown'
missing_race = sentencing_processed["RACE"].isna()
sentencing_processed.loc[missing_race,"RACE"] = "Unknown"

Fix possibly incorrect sentence period

In [173]:
# Limit max sentence_period_years to 100 years. About 30 rows affected
limit_years_mask = sentencing_processed["sentence_period_years"] > 100
sentencing_processed.loc[limit_years_mask, "sentence_period_years"] =  100
# sentencing_processed.sort_values(by=["sentence_period_years"], ascending=False)[["sentence_period_years", "UPDATED_OFFENSE_CATEGORY", "categorical_sentence"]].head(50)

Fix possibly incorrect age at incident

In [175]:
# Replace age>100 with median age
median_age = np.median(sentencing_processed["AGE_AT_INCIDENT"].dropna())

limit_age_mask = sentencing_processed["AGE_AT_INCIDENT"] > 100
sentencing_processed.loc[limit_age_mask, "AGE_AT_INCIDENT"] =  median_age
In [179]:
np.max(sentencing_processed["AGE_AT_INCIDENT"])  # should be < 100
Out[179]:
86.0
In [180]:
# Save processed data to disk
sentencing_processed.to_csv("Sentencing_processed_data.csv")  # when reading, remember to set index_column=0

Appendix

Haven't cleaned this up but there is some useful information which should not be deleted.

List of possible errors in dataset

  1. CASE_ID 113570699133 - sentenced to death for posession of cannabis

Commitment term / unit processing

  1. Only keep rows with commitment unit of [Days, Months, Natural Life]
  2. Sentence can be:

    • a time in days, months or years
      • Time period can be taken from (COMMITMENT_TERM, COMMITMENT_UNIT) columns.
    • or one of three categorical outcomes:
      • Death (Ignore)
      • Life Sentence
      • Cook County Boot Camp
  3. Reason why 'Death' is ignored:

    • sentencing_orig[sentencing_orig.eq('Death').any(axis=1)]
    • 'Death' appears only 9 times under COMMITMENT_TYPE and 63 times under SENTENCE_TYPE.
    • Decided to ignore Death penalty because all of It appears to be clearly incorrect.
    • The two different ways of looking for Death sentences with some overlap in the examples gotten from each method (see below).
    • When Death appears under SENTENCE_TYPE:

      • sentencing_orig.query("SENTENCE_TYPE=='Death'")
      • This is likely incorrect because many of the charges are fairly minor (possesion of cannabis, DUI, robbery, etc.) and do not include murder which is mandatory according to https://deathpenaltyinfo.org/facts-and-research/crimes-punishable-by-death . I checked that the CASE_PARTICIPANT_ID does not also include other crimes (CASE_COUNT mostly 1) which could have explained the death penalty through the other charges (although this shouldn't really be a valid reason as each crime gets an individual sentence).
      • 61/63 examples include (COMMITMENT_TERM, COMMITMENT_UNIT) information that is a time period (Of the ones without a time period, 1 has (NaN, NaN) and one has (1, Term) instead)
    • When Death appears under SENTENCE_TYPE:

    • Since almost all of these examples (both types) include a time period and since the death penalty looks unlikely in many of these cases, I will ignore the death penalty and just take the time period in (COMMITMENT_TERM, COMMITMENT_UNIT)

  4. How to decide whether sentence is 'Natural Life':

    • "Life sentence' expressed in several different ways in this dataset:
      • COMMITMENT_UNIT=='Natural Life' = 712
      • COMMITMENT_TYPE=='Natural Life' and COMMITMENT_UNIT=='NaN'; count = 20
    • it seems you can have COMMITMENT_TYPE='Natural Life' or you can have COMMITMENT_UNIT='Natural Life'
  5. Assumption: For crimes that involve multiple charges, we assume that each conviction is sentenced independantly of the others and is neither influenced by the other nor is the maximum sentence for a single charge applied to each of the individual charges. The opposite of this can be easily proven by inspection (i.e. pick a CASE_ID that has multiple charges and you can find different sentences for each of the charges. Whether or not this holds for all CASE_IDs is not known but is likely from what I have read about how sentencing works.

  6. Drop all convictions that involve cannabis because the sentencing has likely changed over the years.

  7. Something to look at would be to see if the number of convictions for a single crime affects the sentence. To do this, replace the CHARGE_COUNT for a single CASE_ID, with the highest count OR manually count the number of charges for a single CASE_ID for each CASE_ID. Another thing to try would be to only take the charge which has PRIMARY_CHARGE=True and drop all of the other rows for the CASE_ID.

  • Categorical outcomes: Life sentence, Cook County Boot Camp

Todo

Column descriptions: https://datacatalog.cookcountyil.gov/Courts/Sentencing/tg8v-tm6u

Exploratory Data Analsis

  • Look at the data manually and try to find rare charges for which we have too little information to make predictions and remove these examples OR use only the top 10 charges and ignore the rest.
  • PLot a covariance matrix of sentence length vs each of the other variables. It will be interesting to see how things like case duration and age at time of sentence corellate to sentence length.
  • Plot Sankey diagrams of the following:
    • Left: what people were arrested, Right: what they are charged for.
    • Left: charge disposition title, right: sentence duration discretized to 5 year blocks (make sure the order on the right is correct).
    • Left: percentage of race by population of cook county/ Chicago, Right: percentage of race as represented in this dataset

Machine Learning:

  • Look at the proper way to determine a life sentence (i.e. which columns, etc) -> Use life_COMMITMENT_UNIT==True OR life_COMMITMENT_TYPE==True
  • Look at the proper way to determine a cook county boot camp (i.e. which columns, etc)
  • Figure out a conversion for the time for prison sentences (must convert days, months, years to a unique value)
  • How to deal with all of the possible charges (DISPOSITION_CHARGED_OFFENSE_TITLE) has ~1600 possible charges. See this.
  • Should I use classification or regression here? Even though a sentence is in a continuous unit (time), only a very few values seem to be used. For instance, in the worst case senario, there are only ~2000 unique sentences in this dataset of ~200,000 which is around 1% of unique cases. This data, however is ordinal and if we use a naive classification method, we will be losing information. A solution to this would be to use the method described here. If using classification, we could also simplify the problem and decrease weeor by binning any sentences that appear very close together (e.g. use 1 dimensional k-means on the log transform of the prediction (otherwise, you will end up with smaller cluster for the longer duration sentences because even if they change by 1 year, this will be much more than a sentences in the range of say 1 week and 90 days which appear to be more different to each other than the 1 year change for a 30 year sentence.
    • Using classification also solves the problem of dealing with life sentences and boot camp with the continuous time duration sentences.
    • Also try regression, again with log transformed predictions (because again, we care more about the relative differences in time periods). I think random forest regression would work much better than ridge/lasso because the targets are not continuously distributed.
  • For life sentence / boot camp:

    • try a fixed duration like 50 years for life sentence (look at whether Chicago has a limit on life sentence, if yes, use that limit)
    • [Use average life expectancy of a person in the US] - [age at arrest]
    • for boot camp try 0 years
  • See which offense type has the lowest R^2 value - this might be an indicator of crimes which may be the most inconsistently judged

  • Run several versions of the algorithm

    • One with everything except sentence length
    • One whose objective is to predict sentence given the true crime (which is established by the judge) but no other features that depend on the case itself (case duration, charge_disposition_title, charge_disposition, any other features that incluse *disposition*, SENTENCE_JUDGE, etc.)
    • One with only the information available at time of arrest (remove everything feature after arrest, including the features that were kept above)
  • Try using XGBoost

In [224]:
len(sentencing_orig)
Out[224]:
236124

Law facts

  • A crime may bring multiple charges and sentences, but usually only the longest sentence is served because of the 'No Double punishment rule'