Exploratory Data Analysis - Cook County Sentencing Dataset

ℹ️ Please click here to view dynamic Plotly plots without having to run locally.
Code is hidden by default when viewing online. Click on "Show Code" button below to display code.

Note to self. Run this in Anaconda Prompt to generate notebook:
jupyter nbconvert Exploratory_data_analysis.ipynb

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import calendar
In [3]:
import missingno
import seaborn as sns
plt.style.use('ggplot')

import matplotlib.ticker as ticker
In [4]:
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
In [5]:
pd.options.display.max_columns = None  # to force pandas to display all columns

pd.options.display.max_rows = None  # to force pandas to display all rows.
# Warning: always remember to use .head() when looking at complete dataset to avoid displaying all rows.
In [6]:
# Read Cook County Sentencing Data after processing by Sentencing_data_cleaning.ipynb
sentencing_processed = pd.read_csv("Sentencing_processed_data.csv",
                                  parse_dates=["DISPOSITION_DATE", "SENTENCE_DATE",
                                                 "INCIDENT_BEGIN_DATE", "INCIDENT_END_DATE",
                                                 "ARREST_DATE", "ARRAIGNMENT_DATE", "RECEIVED_DATE"],
                                  index_col=0)
C:\Users\minur\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3051: DtypeWarning:

Columns (8,9,12,15) have mixed types. Specify dtype option on import or set low_memory=False.

In [7]:
sentencing = sentencing_processed.copy()

# rename American Indian, Unkown and Biracial into one group -> Other
sentencing = sentencing.replace({"RACE": {"American Indian": "Other/Unknown",
                                          "Unknown": "Other/Unknown",
                                          "Biracial": "Other/Unknown"}})
In [8]:
# Colour Palette

colour = {
    "asian":      "#1f77b4",
    "asian_2":    "#1f77b4",
    "black":      "#ff7f0e",
    "black_2":    "#ff7f0e",
    "hispanic":   "#9467bd",
    "hispanic_2": "#9467bd",
    "white":      "#2ca02c",
    "white_2":    "#2ca02c",
    "other":      "#e377c2",
    "other_2":    "#e377c2",
    
    "background_plot": '#ECECEC', #"#f2f2f2",
    "background_paper": "#f8f8f8",
}

Analysis on whole dataset

Distribution of Sentences by Time

Most of the examples are between 2010 and 2018.

In [9]:
#HIDDEN

fig = px.histogram(sentencing.query("ARREST_DATE > 2005").drop_duplicates(subset=["CASE_PARTICIPANT_ID"]), x="ARREST_DATE")
fig.update_layout(
    title="Arrest Count by Time",
    xaxis_title="Date",
    yaxis_title="Count",
    font=dict(
        size=12,
    )
)

fig.update_layout({
'plot_bgcolor': colour['background_plot'], # '#ECECEC',
'paper_bgcolor': colour["background_paper"] #'#F8F8F8',
})
fig.show()

The seasonality of crime is an interesting topic that has already been the subject of many studies. This report (PDF page 14), for instance, looks at the crime trends in Chicago during the 1970s. Although crime patterns may have changed over the past 50 years and this report looks at only specific kinds of offenses, it does show a characteristic dip in the Winter and a peak in the Summer that I have reproduced using my dataset below (Arrest Count).
Another interesting thing to check is the severity of the crimes commited by month (I have used the prison sentence duration as a proxy for this).

Both of these trends are plotted in the graph below.

In [10]:
sentencing["received_year"] = sentencing["ARREST_DATE"].apply(lambda x: x.year)
sentencing["received_month"] = sentencing["ARREST_DATE"].apply(lambda x: x.month)

# For Arrest Count
group_category = "received_month"
monthly_arrest_count = pd.DataFrame(sentencing.query("ARREST_DATE > 2005 & categorical_sentence=='Prison'").groupby([group_category])["CASE_ID"].count())
monthly_arrest_count = monthly_arrest_count.reset_index()
monthly_arrest_count = monthly_arrest_count.rename(columns={"CASE_ID": "Count"})
# normalize counts for each offense category
monthly_arrest_count = monthly_arrest_count.set_index(group_category)
monthly_arrest_count = monthly_arrest_count.reset_index()
monthly_arrest_count[group_category] = monthly_arrest_count[group_category].apply(lambda x: calendar.month_name[int(x)])

# For Mean Prison Sentence
monthly_mean_sentence = sentencing.query("ARREST_DATE > 2005 & categorical_sentence=='Prison'").groupby([group_category])["sentence_period_years"].mean().reset_index()
monthly_mean_sentence["received_month"] = monthly_mean_sentence["received_month"].apply(lambda x: calendar.month_name[int(x)])
monthly_mean_sentence = monthly_mean_sentence.rename(columns={"sentence_period_years": "Mean Prison Sentence Duration",
                                     "received_month": "Month"})
In [11]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=monthly_arrest_count["received_month"], y=monthly_arrest_count["Count"], name="Arrest Count",
                            marker=dict(color='#d62728')),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=monthly_mean_sentence["Month"], y=monthly_mean_sentence["Mean Prison Sentence Duration"], name="Mean Prison Sentence",
              marker=dict(color='#1f77b4')),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="Monthly Trends in Crime",
    xaxis_showgrid= True,
    yaxis_showgrid= False
)

# Set x-axis title
fig.update_xaxes(title_text="Month")

# Set y-axes titles
fig.update_yaxes(title_text="Arrest Count", secondary_y=False, color='#d62728')
fig.update_yaxes(title_text="Mean Prison Sentence (Years)", secondary_y=True, color='#1f77b4')

fig.update_layout({
    'plot_bgcolor': colour["background_plot"],
    'paper_bgcolor': colour["background_paper"],
    })

fig.show()

Now let's have a look at the types of sentences given

Breakdowns by Sentence and Offense

The most frequent offenses are for narcotics (which includes marjuana related offenses), unlawful use of weapons and Aggravated DUI. The most frequent offenses that are usually classed as felonies (meaning at minimum 1 year jail sentence) are Burglary, Aggravated Battery on a Police Officer and Armed Robbery.

In [12]:
def plotHorizontalBar(df, column_name, print_column_name, title):
    """
    For a given categorical column, display a horizontal stacked bar chart 
    """

    categorical_sentence_counts = pd.DataFrame(df[column_name].value_counts())
    categorical_sentence_counts = categorical_sentence_counts.reset_index().rename(columns={'index': print_column_name, column_name: "Count"})

    df = px.data.tips()
    fig = px.bar(categorical_sentence_counts, x="Count", color=print_column_name, orientation='h',
    #              hover_data=["tip", "size"],
                 height=453,
                 title=title)
    fig.update_layout({
    'plot_bgcolor': colour["background_plot"],
    'paper_bgcolor': colour["background_paper"],
    'yaxis_visible':False,
    })
    fig.show()
In [13]:
plotHorizontalBar(sentencing, "UPDATED_OFFENSE_CATEGORY", "Offense Category", "Offense Category Breakdown")

The most common sentences by far are Prison and Probation, with all others only making up less than 10% all together.

In [14]:
plotHorizontalBar(sentencing, "categorical_sentence", "Sentence Category", "Sentence Category Breakdown")

Analysis by Race

Comparing Race Representation in this Dataset vs General Population

Black people are massively over represented in the sentencing dataset, while all other groups are under represented.
Blacks are represented three times more frequently in the dataset than in the general population.
The lowest representation in the dataset is for Asians who occur 10 times less frequently than you would expect by their proportion to the population. Whites are also appear less frequently in the dataset, by about 3 times less frequently than in the general population. Hispanics are only slightly underrepresented compared to their population frequency.

In [15]:
cook_county_population = 5_194_675
race_percentages = {"Population Count": np.array([.554, .248, .24, .062, 0])*cook_county_population, "Sentencing Count": [32456, 153570, 42015, 1356, 1668]}
populations = pd.DataFrame(data = race_percentages, index=["White", "Black", "Hispanic", "Asian", "Other/Unknown"])
# Note: dropping other races

# normalize columns to percentages
populations.loc[:,"Population Percentage"] = 100*populations["Population Count"]/(populations["Population Count"].sum())
populations.loc[:,"Sentencing Percentage"] = 100*populations["Sentencing Count"]/(populations["Sentencing Count"].sum())
In [16]:
populations
Out[16]:
Population Count Sentencing Count Population Percentage Sentencing Percentage
White 2877849.95 32456 50.181159 14.046264
Black 1288279.40 153570 22.463768 66.461818
Hispanic 1246722.00 42015 21.739130 18.183195
Asian 322069.85 1356 5.615942 0.586848
Other/Unknown 0.00 1668 0.000000 0.721875
In [17]:
import plotly.graph_objects as go
columns_to_get = ["Population Percentage", "Sentencing Percentage"]
white = populations.loc["White",columns_to_get]
black = populations.loc["Black",columns_to_get]
hispanic = populations.loc["Hispanic",columns_to_get]
asian = populations.loc["Asian",columns_to_get]
other = populations.loc["Other/Unknown",columns_to_get]


fig = go.Figure(data=[
    go.Bar(name='White',
           y=white,
           x=columns_to_get,
           orientation='v',
           marker=dict(
               color=colour["white"],
#                line=dict(color=colour["white_2"], width=3)
           )),
    go.Bar(name='Black',
           y=black,
           x=columns_to_get,
           orientation='v',
           marker=dict(
               color=colour["black"],
#                line=dict(color=colour["black_2"], width=3)
           )),
#     go.Bar(name='Black', y=columns_to_get, x=black, orientation='h'),
    go.Bar(name='Hispanic',
           y=hispanic,
           x=columns_to_get,
           orientation='v',
           marker=dict(
               color=colour["hispanic"],
#                line=dict(color=colour["hispanic_2"], width=3)
           )),
    go.Bar(name='Asian',
           y=asian,
           x=columns_to_get,
           orientation='v',
              marker=dict(
               color=colour["asian"],
#                line=dict(color=colour["asian_2"], width=3)
           )),
    go.Bar(name='Other/Unknown',
       y=other,
       x=columns_to_get,
       orientation='v',
          marker=dict(
           color=colour["other"],
#                line=dict(color=colour["asian_2"], width=3)
       ))
    
])
# Change the bar mode
# fig.update_layout(barmode='stack')
fig.update_layout(barmode='stack', title_text="Percentage of Race in Population and Sentencing ", font_size=12, height=500, width = 600)
# fig.layout.template = 'ggplot2'

fig.update_layout({
'plot_bgcolor': colour["background_plot"],
'paper_bgcolor': colour["background_paper"],
})
fig.show()


# Background #ecf0f1
# f19c79, c68064
# D4E09B, AEB87F
# f4f097, c8c57c
# bcd7dd, 9ab0b5

Are Some Races Convicted of Certain Crimes More Frequently than Others?

This is a plot of how much more frequently a person of the stated race is sentenced for a particular offense compared to the frequency for all other races.
For instance, Asians are more frequently convicted of Theft and Sex Crimes than other races while Blacks are more frequently involved in the unlawful use of weapons and homicide which could be explained by the over representation of Blacks in Chicago area gangs. An stark outlier is the frequency of Aggravated DUI among Hispanics. I could not find any information online that might explain this disparity so the high frequency may be due to a incomplete / incorrect data.

In [18]:
race_dataset = pd.DataFrame({"Asian": [],
                             "Black": [],
                             "Hispanic": [],
                             "White": [],
                             "Other/Unknown": []})

other_race_dataset = pd.DataFrame({"Asian": [],
                                   "Black": [],
                                   "Hispanic": [],
                                   "White": [],
                                   "Other/Unknown": []})

temp_counts = pd.DataFrame(sentencing["UPDATED_OFFENSE_CATEGORY"].value_counts()).rename(columns={"UPDATED_OFFENSE_CATEGORY": "Count"})

race_dataset = pd.concat([race_dataset, temp_counts.copy()])
race_dataset = race_dataset.drop(columns="Count");

other_race_dataset = pd.concat([other_race_dataset, temp_counts.copy()])
other_race_dataset = other_race_dataset.drop(columns="Count");
C:\Users\minur\Anaconda3\lib\site-packages\ipykernel_launcher.py:15: FutureWarning:

Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.


C:\Users\minur\Anaconda3\lib\site-packages\ipykernel_launcher.py:18: FutureWarning:

Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.


In [19]:
# count number percentage of sentence_category for each race and percentage for the remaining races, then subtract
variable = "UPDATED_OFFENSE_CATEGORY"
list_races = set(sentencing["RACE"].unique())


for race in list_races:
    race_filter = sentencing.query("RACE == @race")
    race_total = len(race_filter)
    other_race_filter = sentencing.query("RACE != @race")
    other_race_total = len(other_race_filter)
    
    race_frequency = race_filter["UPDATED_OFFENSE_CATEGORY"].value_counts() / race_total
    other_race_frequency = other_race_filter["UPDATED_OFFENSE_CATEGORY"].value_counts() / other_race_total
    
    race_dataset.loc[race_frequency.index, race] = race_frequency.values.flatten()
    other_race_dataset.loc[other_race_frequency.index, race] = other_race_frequency.values.flatten()
In [20]:
plot_df = (race_dataset - other_race_dataset)/other_race_dataset
plot_df = plot_df.reset_index()
plot_df = plot_df.fillna(0)
plot_df = plot_df.loc[:15,:]
In [21]:
# sns.set(style="whitegrid")
sns.set_style("dark")
sns.set(font_scale=1.25)
# Load the dataset
# crashes = sns.load_dataset("car_crashes")

plot_range = max([np.abs(plot_df.max()[1:].max()), np.abs(plot_df.min()[1:].min())])

# Make the PairGrid
g = sns.PairGrid(plot_df,
                 x_vars=plot_df.head().columns[1:],
                 y_vars="index",
                 height=6, aspect=.4)

# Draw a dot plot using the stripplot function
g.map(sns.stripplot, size=10, orient="h",
      palette="ch:s=1,r=-.1,h=1_r", linewidth=1, edgecolor="w")

# Use the same x axis limits on all columns and add better labels
g.set(xlim=(-plot_range, plot_range), xlabel="Fraction of Occurence \n for other Races", ylabel="")

# Use semantically meaningful titles for the columns
titles = plot_df.head().columns[1:]
for ax, title in zip(g.axes.flat, titles):

    # Set a different title for each axes
    ax.set(title=title)

    # Make the grid horizontal instead of vertical
    ax.xaxis.grid(True)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(2.5))
    ax.grid(b=True, which='major', color='w', linewidth=2.0)
#     ax.xaxis.set_major_locator(ticker.MultipleLocator(2))
#     ax.grid(b=True, which='major', color='b', linewidth=1.0)
    ax.yaxis.grid(True)
# sns.despine(left=True, bottom=True)

How does the Offense Category affect the Sentence Category?

This is a Sankey diagram showing how the number of incidents flow between Race, Offense and Sentence type.
I have grouped together a bunch of offenses into 'Other Most Serious Felonies' and 'Other' (excluding the most serious felonies) to avoid making the chart too cluttered.
You can hover over and click on the flow lines to get the count associated with that flow.

In [22]:
# Group Offenses

sentencing_recategorized = sentencing.copy()
serious_felonies = ["Armed Robbery", "Aggravated Battery Police Officer"
                    "Aggravated Robbery" , "Aggravated Battery With A Firearm",
                    "Attempt Homicide", "Attempt Armed Robbery", "Sex Crimes"
                    "Kidnapping", "Human Trafficking", "Child Abduction"]

serious_felonies_mask = sentencing_recategorized["UPDATED_OFFENSE_CATEGORY"].isin(serious_felonies)
sentencing_recategorized.loc[serious_felonies_mask, "UPDATED_OFFENSE_CATEGORY"] = "Other Most Serious Felonies"


select_categories = ["Narcotics", "UUW - Unlawful Use of Weapon",
                     "Homicide", "Other Most Serious Felonies"]

select_categories_mask = sentencing_recategorized["UPDATED_OFFENSE_CATEGORY"].isin(select_categories)
sentencing_recategorized.loc[~select_categories_mask, "UPDATED_OFFENSE_CATEGORY"] = "Other"
In [23]:
# Group low count sentence types
other_non_prison = ["Intensive Probation Services", "Conditional Discharge",
                   "Boot Camp", "Court Supervision",
                   "Intensive Drug Probation Services", "Drug Court Probation",
                  "Gang Probation", "Sex Offender Probation", "Drug School",
                  "Juvenile IDOC"]
non_prison_mask = sentencing_recategorized["categorical_sentence"].isin(other_non_prison)
sentencing_recategorized.loc[non_prison_mask, "categorical_sentence"] = "Other Non-Prison"
In [24]:
sankey_data = sentencing_recategorized.loc[:,["RACE", "UPDATED_OFFENSE_CATEGORY", "categorical_sentence", "CHARGE_ID"]]
sankey_data = sankey_data.groupby(["RACE", "UPDATED_OFFENSE_CATEGORY", "categorical_sentence"]).agg('count')

sankey_data = sankey_data.reset_index()
sankey_data = sankey_data.rename(columns={"CHARGE_ID":"count"})
In [25]:
# Code taken from internet: https://medium.com/kenlok/how-to-create-sankey-diagrams-from-dataframes-in-python-e221c1b4d6b0

def genSankey(df,cat_cols=[],value_cols='',title='Sankey Diagram'):
    # maximum of 6 value cols -> 6 colors
    colorPalette = ['#4B8BBE','#306998','#FFE873','#FFD43B','#646464']
    labelList = []
    colorNumList = []
    for catCol in cat_cols:
        labelListTemp =  list(set(df[catCol].values))
        colorNumList.append(len(labelListTemp))
        labelList = labelList + labelListTemp
        
    # remove duplicates from labelList
    labelList = list(dict.fromkeys(labelList))
    
    # define colors based on number of levels
    colorList = []
    for idx, colorNum in enumerate(colorNumList):
        colorList = colorList + [colorPalette[idx]]*colorNum
        
    # transform df into a source-target pair
    for i in range(len(cat_cols)-1):
        if i==0:
            sourceTargetDf = df[[cat_cols[i],cat_cols[i+1],value_cols]]
            sourceTargetDf.columns = ['source','target','count']
        else:
            tempDf = df[[cat_cols[i],cat_cols[i+1],value_cols]]
            tempDf.columns = ['source','target','count']
            sourceTargetDf = pd.concat([sourceTargetDf,tempDf])
        sourceTargetDf = sourceTargetDf.groupby(['source','target']).agg({'count':'sum'}).reset_index()
        
    # add index for source-target pair
    sourceTargetDf['sourceID'] = sourceTargetDf['source'].apply(lambda x: labelList.index(x))
    sourceTargetDf['targetID'] = sourceTargetDf['target'].apply(lambda x: labelList.index(x))
    
    # creating the sankey diagram
    data = dict(
        type='sankey',
        node = dict(
          pad = 15,
          thickness = 20,
          line = dict(
            color = "black",
            width = 0.5
          ),
          label = labelList,
          color = colorList
        ),
        link = dict(
          source = sourceTargetDf['sourceID'],
          target = sourceTargetDf['targetID'],
          value = sourceTargetDf['count']
        )
      )
    
    layout =  dict(
        title = title,
        font = dict(
          size = 10
        )
    )
       
    fig = dict(data=[data], layout=layout)
    return fig
In [28]:
sankey_data_transformed = genSankey(sankey_data,cat_cols=['RACE','UPDATED_OFFENSE_CATEGORY', 'categorical_sentence'],value_cols='count',title='Word Etymology')

fig = go.Figure(data = [go.Sankey(node = sankey_data_transformed['data'][0]['node'],
                             link = sankey_data_transformed['data'][0]['link'])])
fig.update_layout(title_text="Race, Offense and Sentence Category", font_size=10, height=600, width = 900)
fig.layout.template = 'ggplot2'
fig.show()

Age at Time of Incident

For all races, the most frequent age at time of incident is between 20-35 years, with the frequency steadily dropping with age.
The distribution among Blacks differs from that of other races in that the peak occurs in the early 20's and also there is a much higher proportion of incidents among teenagers than for other races.

In [29]:
# fig = make_subplots(rows=3, cols=2)

fig = make_subplots(rows=2,
                    cols=3,
                   subplot_titles=("Asian","Black", "",  "Hispanic", "White", "Other/Unkown"))
age_asian = go.Histogram(x=sentencing.query("RACE == 'Asian'")["AGE_AT_INCIDENT"],
                        xbins=dict(start=0, end=100, size=5), name="Asian",
                        marker=dict(color=colour["asian"]))
age_black = go.Histogram(x=sentencing.query("RACE == 'Black'")["AGE_AT_INCIDENT"],
                         xbins=dict(start=0, end=100, size=5), name="Black",
                         marker=dict(color=colour["black"]))

age_hispanic = go.Histogram(x=sentencing.query("RACE == 'Hispanic'")["AGE_AT_INCIDENT"],
                            xbins=dict(start=0, end=100, size=5), name="Hispanic",
                            marker=dict(color=colour["hispanic"]))

age_white = go.Histogram(x=sentencing.query("RACE == 'White'")["AGE_AT_INCIDENT"],
                         xbins=dict(start=0, end=100, size=5), name="White",
                         marker=dict(color=colour["white"]))

age_other = go.Histogram(x=sentencing.query("RACE == 'Other/Unknown'")["AGE_AT_INCIDENT"],
                         xbins=dict(start=0, end=100, size=5), name="Other/Unknown",
                         marker=dict(color=colour["other"]))


fig.append_trace(age_asian, 1, 1)
fig.append_trace(age_black, 1, 2)
fig.append_trace(age_hispanic, 2, 1)
fig.append_trace(age_white, 2, 2)
fig.append_trace(age_other, 2, 3)
fig.update_layout(
    margin=dict(l=20, r=20, t=20, b=20),
    plot_bgcolor = colour["background_plot"],
    paper_bgcolor = colour["background_paper"],
    showlegend=False
)

fig.show()

Duration of Prison Sentences

Note: Histogram is clipped after 10 years since there are proportionally very few instaces with longer durations which makes them hard to see.

When considering only prison sentences, the most frequent duration for all races is 3 years (1 year bins). The only clear difference is that the distribution is much more spread out for Blacks with both a greater frequency of sentences below 2 years and greater than 3 years.

In [30]:
fig = make_subplots(rows=2,
                    cols=3,
                   subplot_titles=("Asian","Black", "", "Hispanic", "White", "Other/Unkown"))

age_asian = go.Histogram(x=sentencing.query("RACE == 'Asian'")["sentence_period_years"],
                        xbins=dict(start=0, end=10, size=1), name="Asian",
                        marker=dict(color=colour["asian"]))

age_black = go.Histogram(x=sentencing.query("RACE == 'Black'")["sentence_period_years"],
                         xbins=dict(start=0, end=10, size=1), name="Black",
                         marker=dict(color=colour["black"]))

age_hispanic = go.Histogram(x=sentencing.query("RACE == 'Hispanic'")["sentence_period_years"],
                            xbins=dict(start=0, end=10, size=1), name="Hispanic",
                            marker=dict(color=colour["hispanic"]))

age_white = go.Histogram(x=sentencing.query("RACE == 'White'")["sentence_period_years"],
                         xbins=dict(start=0, end=10, size=1), name="White",
                         marker=dict(color=colour["white"]))

age_other = go.Histogram(x=sentencing.query("RACE == 'Other/Unknown'")["sentence_period_years"],
                         xbins=dict(start=0, end=10, size=1), name="Other/Unknown",
                         marker=dict(color=colour["other"]))

# fig.set(ylim=(1, 10000),ylabel="")

# fig.set(xlim=(0, 100), xlabel="a", ylabel="")
fig.append_trace(age_asian, 1, 1)
fig.append_trace(age_black, 1, 2)
fig.append_trace(age_hispanic, 2, 1)
fig.append_trace(age_white, 2, 2)
fig.append_trace(age_other, 2, 3)
fig.update_layout(
    margin=dict(l=20, r=20, t=20, b=20),
    plot_bgcolor = colour["background_plot"],
    paper_bgcolor = colour["background_paper"],
    showlegend=False
)

fig.show()

There is no significant change in the frequency of occurency by race over the time period covered in this dataset. The frequency of incidents is roughly the same among all races at a given time.

In [31]:
sentencing["received_year"] = sentencing["ARREST_DATE"].apply(lambda x: x.year)
sentencing["received_month"] = sentencing["ARREST_DATE"].apply(lambda x: x.month)

group_category = "RACE"
temp_time = pd.DataFrame(sentencing.query("ARREST_DATE > 2005").groupby([group_category, "received_year"])["CASE_ID"].count())
temp_time = temp_time.reset_index()
temp_time = temp_time.rename(columns={"CASE_ID": "Count"})

# normalize counts for each offense category
temp_time = temp_time.set_index(group_category)
totals = sentencing.query("ARREST_DATE > 2005")[group_category].value_counts()
for o in sentencing[group_category].unique():
    temp_time.loc[o, "Count"] = temp_time.loc[o, "Count"]/totals[o]
temp_time = temp_time.reset_index()
In [32]:
fig = px.line(temp_time, x="received_year", y="Count", color=group_category)

fig.update_layout(
    title="Arrest Count by Year (Normalized by Race)",
    xaxis_title="Year",
    yaxis_title="Normalized Arrest Count",
    font=dict(
#         family="Courier New, monospace",
        size=12,
#         color="#7f7f7f"
    )
)

fig.update_layout({
'plot_bgcolor': colour['background_plot'], # '#ECECEC',
'paper_bgcolor': colour["background_paper"] #'#F8F8F8',
})
fig.show()