#Import necessary libraries
import numpy as np
import pandas as pd
#Let pandas print up to 10,000 rows and use a 1,000-character display width
pd.options.display.max_rows = 10000
pd.options.display.width = 1000


# Read the four tab-separated exports: incidence and mortality, each split
# into a 1999-2008 file and a 2009-2018 file (read_table defaults to sep="\t").
Incidence_1999_2008 = pd.read_table('./US_cancer_1999_2008_Incidence.txt')
Incidence_2009_2018 = pd.read_table('./US_cancer_2009_2018_Incidence.txt')
Mortality_1999_2008 = pd.read_table('./US_cancer_1999_2008_Mortality.txt')
Mortality_2009_2018 = pd.read_table('./US_cancer_2009_2018_Mortality.txt')

# Preview every raw table in the notebook output, in loading order.
for _raw_table in (
    Incidence_1999_2008,
    Incidence_2009_2018,
    Mortality_1999_2008,
    Mortality_2009_2018,
):
    display(_raw_table)


# Stack the two decades of each table into a single frame; ignore_index=True
# renumbers the rows instead of keeping each file's own 0-based index.
Incidence, Mortality = (
    pd.concat(decades, ignore_index=True)
    for decades in (
        [Incidence_1999_2008, Incidence_2009_2018],
        [Mortality_1999_2008, Mortality_2009_2018],
    )
)
display(Incidence)
display(Mortality)


# Show the dtype of every column in the combined incidence table.
Incidence.dtypes

Notes                        float64
Leading Cancer Sites          object
Leading Cancer Sites Code     object
Race                          object
Race Code                     object
Year                           int64
Year Code                      int64
Sex                           object
Sex Code                      object
Age Groups                    object
Age Groups Code               object
Count                         object
dtype: object


# Show the dtype of every column in the combined mortality table.
Mortality.dtypes

Notes                        float64
Leading Cancer Sites          object
Leading Cancer Sites Code     object
Race                          object
Race Code                     object
Year                           int64
Year Code                      int64
Sex                           object
Sex Code                      object
Age Group                     object
Age Group Code                object
Deaths                        object
dtype: object


# List the distinct values of the "Notes" column in the incidence table.
Incidence["Notes"].unique()

array([nan])


# List the distinct values of the "Notes" column in the mortality table.
Mortality["Notes"].unique()

array([nan])


# List the distinct cancer-site labels used by the incidence table.
Incidence["Leading Cancer Sites"].unique()

array(['Brain and Other Nervous System', 'Breast', 'Cervix Uteri',
       'Colon and Rectum', 'Corpus Uteri', 'Esophagus', 'Gallbladder',
       'Kidney and Renal Pelvis', 'Larynx', 'Leukemias', 'Liver',
       'Lung and Bronchus', 'Melanoma of the Skin', 'Myeloma',
       'Non-Hodgkin Lymphoma', 'Oral Cavity and Pharynx', 'Ovary',
       'Pancreas', 'Prostate', 'Stomach', 'Thyroid',
       'Urinary Bladder, invasive and in situ'], dtype=object)


# List the distinct cancer-site labels used by the mortality table.
Mortality["Leading Cancer Sites"].unique()

array(['Brain and Other Nervous System', 'Breast', 'Cervix Uteri',
       'Colon and Rectum', 'Corpus Uteri', 'Esophagus', 'Gallbladder',
       'Kidney and Renal Pelvis', 'Larynx', 'Leukemias', 'Liver',
       'Lung and Bronchus', 'Melanoma of the Skin', 'Myeloma',
       'Non-Hodgkin Lymphoma', 'Oral Cavity and Pharynx', 'Ovary',
       'Pancreas', 'Prostate', 'Stomach', 'Thyroid', 'Urinary Bladder'],
      dtype=object)


# The incidence table labels bladder cancer 'Urinary Bladder, invasive and in
# situ' while the mortality table uses 'Urinary Bladder'. Rename the incidence
# label so both tables share one site name and can be joined on it.
# Only the site column is rewritten, so no other column can be altered by
# accident and pandas does not have to scan the whole frame.
Incidence["Leading Cancer Sites"] = Incidence["Leading Cancer Sites"].replace(
    'Urinary Bladder, invasive and in situ', 'Urinary Bladder'
)
Incidence


# Drop the empty "Notes" column and the redundant label/code duplicates, keeping
# one column per dimension ("Sex Code" and the age-group code).
Incidence.drop(columns=["Notes", "Leading Cancer Sites Code", "Race Code", "Year Code", "Sex", "Age Groups"], inplace=True)
Mortality.drop(columns=["Notes", "Leading Cancer Sites Code", "Race Code", "Year Code", "Sex", "Age Group"], inplace=True)

# The two tables name the age-code column differently; give both the same name
# so it can serve as a join key.
Incidence.rename(columns={"Age Groups Code": "Age Group"}, inplace=True)
Mortality.rename(columns={"Age Group Code": "Age Group"}, inplace=True)

# Outer join keeps combinations that appear in only one of the two tables.
df = Incidence.merge(Mortality, on=["Leading Cancer Sites", "Race", "Year", "Sex Code", "Age Group"], how="outer")

# Encode every age-group code as the lower bound of its range
# (code '1' is the "< 1 year" group, hence 0).
age_group_lower_bound = {
    '1': 0, '1-4': 1, '5-9': 5, '10-14': 10, '15-19': 15, '20-24': 20,
    '25-29': 25, '30-34': 30, '35-39': 35, '40-44': 40, '45-49': 45,
    '50-54': 50, '55-59': 55, '60-64': 60, '65-69': 65, '70-74': 70,
    '75-79': 75, '80-84': 80, '85+': 85,
}
# Apply the mapping to the "Age Group" column only. A whole-frame replace would
# also turn a "Count" or "Deaths" string of '1' into 0.
df["Age Group"] = df["Age Group"].replace(age_group_lower_bound)


# Substitute the placeholder 'Suppressed' with 8 everywhere in the frame.
# NOTE(review): 8 is a hard-coded stand-in for suppressed cells — confirm the rationale.
df = df.replace({'Suppressed': 8})

# Turn both tallies into numbers; anything that still cannot be parsed becomes NaN.
for _tally in ("Count", "Deaths"):
    df[_tally] = pd.to_numeric(df[_tally], errors="coerce")
df


# Cancer site with the largest total "Count" summed over all rows.
df.groupby(["Leading Cancer Sites"])["Count"].sum().idxmax()

'Breast'


# Cancer site with the largest total "Deaths" summed over all rows.
df.groupby(["Leading Cancer Sites"])["Deaths"].sum().idxmax()

'Lung and Bronchus'


# Total "Count" for every (cancer site, sex code) pair.
df.groupby(["Leading Cancer Sites", "Sex Code"])["Count"].sum()
# NOTE(review): this cell was annotated "Same likelihood"; the per-sex totals it prints differ for most sites — confirm what the remark refers to.

Leading Cancer Sites            Sex Code
Brain and Other Nervous System  F            193517.0
                                M            237384.0
Breast                          F           4399782.0
                                M             40018.0
Cervix Uteri                    F            258451.0
                                M                 0.0
Colon and Rectum                F           1418504.0
                                M           1512601.0
Corpus Uteri                    F            883689.0
                                M                 0.0
Esophagus                       F             72355.0
                                M            251454.0
Gallbladder                     F             53309.0
                                M             25440.0
Kidney and Renal Pelvis         F            390202.0
                                M            645503.0
Larynx                          F             54143.0
                                M            201049.0
Leukemias                       F            380629.0
                                M            515759.0
Liver                           F            113039.0
                                M            315737.0
Lung and Bronchus               F           1959879.0
                                M           2280589.0
Melanoma of the Skin            F            543773.0
                                M            756420.0
Myeloma                         F            193554.0
                                M            234840.0
Non-Hodgkin Lymphoma            F            584326.0
                                M            686504.0
Oral Cavity and Pharynx         F            228325.0
                                M            534189.0
Ovary                           F            431176.0
                                M                 0.0
Pancreas                        F            400669.0
                                M            410182.0
Prostate                        F                 0.0
                                M           4182788.0
Stomach                         F            173306.0
                                M            274333.0
Thyroid                         F            563402.0
                                M            187842.0
Urinary Bladder                 F            340095.0
                                M           1038584.0
Name: Count, dtype: float64


# Bar chart of the total "Count" for each cancer site.
_cases_by_site = df.groupby(["Leading Cancer Sites"])["Count"].sum()
_cases_by_site.plot.bar().set_title("Cancer cases by sites")

Text(0.5, 1.0, 'Cancer cases by sites')


# Bar chart of the total "Deaths" for each cancer site.
_deaths_by_site = df.groupby(["Leading Cancer Sites"])["Deaths"].sum()
_deaths_by_site.plot.bar().set_title("Cancer deaths by sites")

Text(0.5, 1.0, 'Cancer deaths by sites')


# Line chart of yearly total "Deaths" for rows whose Race is "White".
_white_rows = df[df["Race"] == "White"]
_white_deaths_by_year = _white_rows.groupby(["Year"])["Deaths"].sum()
_white_deaths_by_year.plot.line().set_title("Cancer deaths among white community over year ")

Text(0.5, 1.0, 'Cancer deaths among white community over year ')


# Load the income table and reshape it to match the race labels of the cancer data.
income_all = pd.read_csv("./income_1967_2021.csv")
# Rows labelled 32..51 hold the years 1999-2018 in this file.
income_df = income_all.loc[32:51]

# Income column -> race label used in the cancer data.
# NOTE(review): "Hispanic (any race)" is relabelled "Other Races and Unknown
# combined" and "White, non-Hispanic" becomes "White"; these categories are not
# the same populations — confirm the mapping is acceptable for the comparison.
income_to_race_label = {
    "All races": "All races",
    "White, non-Hispanic": "White",
    "Black": "Black or African American",
    "Hispanic (any race)": "Other Races and Unknown combined",
    "Asian": "Asian or Pacific Islander",
}

# Strip the thousands separators column by column, convert to numbers, and
# rename by explicit mapping rather than by column position.
income_df_clean = (
    income_df[list(income_to_race_label)]
    .apply(lambda column: pd.to_numeric(column.str.replace(',', '')))
    .rename(columns=income_to_race_label)
)
# "Year" stays an ordinary first column because the plotting code selects it with x="Year".
income_df_clean.insert(0, "Year", income_df["Year"])
income_df_clean


# List the distinct race labels present in the merged table.
df["Race"].unique()

array(['American Indian or Alaska Native', 'Asian or Pacific Islander',
       'Black or African American', 'White',
       'Other Races and Unknown combined'], dtype=object)

# Display the merged incidence/mortality table.
df


import matplotlib.pyplot as plt
from matplotlib.ticker import (MultipleLocator,
                               FormatStrFormatter,
                               AutoMinorLocator)
# Three side-by-side panels of yearly totals per race: diagnosed cases, deaths,
# and deaths divided by cases.
races = ['American Indian or Alaska Native', 'Asian or Pacific Islander',
       'Black or African American', 'White',
       'Other Races and Unknown combined']
fig, axes = plt.subplots(1, 3, figsize=(21, 4))
_panel_titles = (
    "Total cancer counts by race in the US 1999-2018",
    "Total cancer deaths by race in the US 1999-2018",
    "Cancer death rate by race in the US 1999-2018",
)
for _panel, _panel_title in zip(axes, _panel_titles):
    _panel.set_title(_panel_title)
    # One major tick every two years.
    _panel.xaxis.set_major_locator(MultipleLocator(2))

for race in races:
    # Filter and group once per race instead of once per plotted quantity.
    _by_year = df[df["Race"] == race].groupby(["Year"])
    _cases = _by_year["Count"].sum()
    _deaths = _by_year["Deaths"].sum()
    _cases.plot.line(ax=axes[0], label=race, legend=True)
    _deaths.plot.line(ax=axes[1], label=race, legend=True)
    (_deaths / _cases).plot.line(ax=axes[2], label=race, legend=True)
# The y label is the same for every race, so set it once.
axes[2].set_ylabel("Death rate")


# Income over time, one line per non-"Year" column of income_df_clean.
fig, ax = plt.subplots(1, 1, figsize=(8, 4))
ax.set_title("Median income by race in the US 1999-2018")
ax.set_ylabel("Annual income in dollars")
# `label` is deliberately not passed: a multi-column DataFrame plot names each
# line after its column, and a single label would make this cell depend on a
# loop variable defined in a different cell.
income_df_clean.plot.line(x="Year", ax=ax, legend=True)
# One major tick every two years.
ax.xaxis.set_major_locator(MultipleLocator(2))


# Age profile for rows whose Race is "White": total cases, total deaths, and
# deaths divided by cases, one panel each.
white = df[df["Race"]=="White"]
fig, axes = plt.subplots(1, 3, figsize=(21, 4))

_white_by_age = white.groupby("Age Group")
_white_cases = _white_by_age["Count"].sum()
_white_deaths = _white_by_age["Deaths"].sum()

axes[0].set_title("Total cancer counts by age group among white community")
_white_cases.plot(ax=axes[0])
axes[1].set_title("Total cancer deaths by age group among white community")
_white_deaths.plot(ax=axes[1])
axes[2].set_title("Cancer death rate by age group among white community")
(_white_deaths / _white_cases).plot(ax=axes[2])

<AxesSubplot:title={'center':'Cancer death rate by age group among white community'}, xlabel='Age Group'>


# Age profile for rows whose Race is 'Black or African American': total cases,
# total deaths, and deaths divided by cases, one panel each.
black = df[df["Race"]=='Black or African American']
fig, axes = plt.subplots(1, 3, figsize=(21, 4))

_black_by_age = black.groupby("Age Group")
_black_cases = _black_by_age["Count"].sum()
_black_deaths = _black_by_age["Deaths"].sum()

axes[0].set_title("Total cancer counts by age group among black community")
_black_cases.plot(ax=axes[0])
axes[1].set_title("Total cancer deaths by age group among black community")
_black_deaths.plot(ax=axes[1])
axes[2].set_title("Cancer death rate by age group among black community")
(_black_deaths / _black_cases).plot(ax=axes[2])

<AxesSubplot:title={'center':'Cancer death rate by age group among black community'}, xlabel='Age Group'>


# Bar chart of total deaths divided by total cases for each cancer site.
_by_site = df.groupby(["Leading Cancer Sites"])
_site_deaths = _by_site["Deaths"].sum()
_site_cases = _by_site["Count"].sum()
(_site_deaths / _site_cases).plot.bar().set_title("Death rate among cancer sites")

Text(0.5, 1.0, 'Death rate among cancer sites')

# Display the merged incidence/mortality table.
df


#Import the necessary tools from scikit-learn
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
# Categorical/numeric inputs the models are trained on.
features = ["Leading Cancer Sites", "Race", "Sex Code", "Age Group"]

# Target: deaths per diagnosed case.
# NOTE(review): the 0.01 added to the denominator presumably guards against
# division by zero — confirm; it also slightly lowers every rate.
df["Death Rate"] = df["Deaths"] / (df["Count"] + 0.01)

# Discard every row that has a NaN in any column (done in place).
df.dropna(inplace=True)

# Training inputs as one dict per row, and the matching target column.
X_train_dict = df[features].to_dict(orient="records")
y_train = df["Death Rate"]

# Dummy-encode the string features; fit_transform fits the vocabulary and
# encodes the rows in one call.
vec = DictVectorizer(sparse=False)
X_train = vec.fit_transform(X_train_dict)

# Standardise every encoded column using the statistics of the training data.
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)


#Performance test using cross validation
def KNN_MAE(k):
    """Return the 5-fold cross-validated mean absolute error of a k-NN regressor.

    A fresh vectorizer -> scaler -> KNeighborsRegressor pipeline is scored on
    the module-level ``X_train_dict`` / ``y_train``.

    Args:
        k: Number of neighbours given to ``KNeighborsRegressor``.

    Returns:
        The mean of the five per-fold mean absolute errors.
    """
    steps = [
        ("vectorizer", DictVectorizer(sparse=False)),
        ("scaler", StandardScaler()),
        ("fit", KNeighborsRegressor(n_neighbors=k)),
    ]
    # The scorer reports negated errors, so flip the sign before averaging.
    fold_scores = cross_val_score(
        Pipeline(steps),
        X_train_dict,
        y_train,
        cv=5,
        scoring="neg_mean_absolute_error",
    )
    return np.mean(-fold_scores)


# Cross-validated error of the k-NN model for k = 1..49.
k_values = np.arange(1, 50)
MAE = [KNN_MAE(k) for k in k_values]

# Index the errors by the k that produced them; a bare list would be plotted
# against positions 0..48 and shift the whole curve one step to the left.
pd.DataFrame(MAE, index=k_values).plot.line(xlabel = "k", ylabel="error", legend=False)

<AxesSubplot:xlabel='k', ylabel='error'>


from sklearn.ensemble import RandomForestRegressor
def random_forest(k):
    """Return the 5-fold cross-validated mean absolute error of a random forest.

    A fresh vectorizer -> scaler -> RandomForestRegressor pipeline is scored on
    the module-level ``X_train_dict`` / ``y_train``.

    Args:
        k: Number of trees (``n_estimators``) in the forest.

    Returns:
        The mean of the five per-fold mean absolute errors.
    """
    vec = DictVectorizer(sparse=False)
    scaler = StandardScaler()
    # max_features=1.0 considers every feature at each split, which is what
    # "auto" meant for regressors; the string "auto" was deprecated in
    # scikit-learn 1.1 and removed in 1.3.
    model = RandomForestRegressor(n_estimators=k, max_features=1.0, random_state=44)
    pipeline = Pipeline([("vectorizer", vec), ("scaler", scaler), ("fit", model)])
    # The scorer reports negated errors, so flip the sign before averaging.
    scores = cross_val_score(pipeline, X_train_dict, y_train,
                             cv=5, scoring="neg_mean_absolute_error")
    MAE = np.mean(-scores)
    return MAE


# Cross-validated error of the random forest for n_estimators = 1..49.
n_estimators_values = np.arange(1, 50)
MAE2 = [random_forest(n) for n in n_estimators_values]

# Index the errors by the tree count that produced them; a bare list would be
# plotted against positions 0..48 and shift the curve one step to the left.
pd.DataFrame(MAE2, index=n_estimators_values).plot.line(xlabel = "n_estimators", ylabel="error", legend=False)

<AxesSubplot:xlabel='n_estimators', ylabel='error'>


# Query record: liver cancer, White, female, age 32.
# NOTE(review): training ages are age-group lower bounds (0, 1, 5, ..., 85), so
# 32 lies between two encoded groups — confirm this is intended.
X_new_dict = [
    {
        "Leading Cancer Sites": "Liver",
        "Race": "White",
        "Sex Code": "F",
        "Age Group": 32,
    }
]
# Encode and scale the query with the vectorizer and scaler fitted on the training data.
X_new = vec.transform(X_new_dict)
X_new_sc = scaler.transform(X_new)

# NOTE(review): k=50 is outside the 1..49 range explored by the k sweep — confirm the choice.
kNN_model = KNeighborsRegressor(n_neighbors=50).fit(X_train_sc, y_train)
kNN_model.predict(X_new_sc)

array([0.57269629])


# Fit a 9-tree random forest on the scaled training data and predict the rate
# for the current query row.
# max_features=1.0 considers every feature at each split, which is what "auto"
# meant for regressors; "auto" was deprecated in scikit-learn 1.1 and removed in 1.3.
RF_model = RandomForestRegressor(n_estimators=9, max_features=1.0, random_state=44)
RF_model.fit(X_train_sc, y_train)
RF_model.predict(X_new_sc)

array([0.66483548])


# Query record: prostate cancer, Black or African American, sex code "F", age 45.
# NOTE(review): the per-sex incidence summary shows a total of 0.0 for
# ("Prostate", "F") — confirm this combination is the intended query.
X_new_dict = [
    {
        "Leading Cancer Sites": "Prostate",
        "Race": "Black or African American",
        "Sex Code": "F",
        "Age Group": 45,
    }
]
# Encode and scale the query with the vectorizer and scaler fitted on the training data.
X_new = vec.transform(X_new_dict)
X_new_sc = scaler.transform(X_new)

# NOTE(review): k=50 is outside the 1..49 range explored by the k sweep — confirm the choice.
kNN_model = KNeighborsRegressor(n_neighbors=50).fit(X_train_sc, y_train)
kNN_model.predict(X_new_sc)

array([0.028965])


# Fit a 9-tree random forest on the scaled training data and predict the rate
# for the current query row.
# max_features=1.0 considers every feature at each split, which is what "auto"
# meant for regressors; "auto" was deprecated in scikit-learn 1.1 and removed in 1.3.
RF_model = RandomForestRegressor(n_estimators=9, max_features=1.0, random_state=44)
RF_model.fit(X_train_sc, y_train)
RF_model.predict(X_new_sc)

array([0.02619615])

	Year	All races	White	Black or African American	Other Races and Unknown combined	Asian or Pacific Islander
32	1999	63423	68817	43497	47916	79419
33	2000	63292	68768	44718	49995	84043
34	2001	61889	67864	43191	49193	78607
35	2002	61190	67669	42098	47763	75439
36	2003	61113	67404	41885	46552	77964
37	2004	60901	67187	41534	47078	78917
38	2005	61553	67476	41128	47789	81114
39	2006	62033	67467	41353	48623	82237
40	2007	62865	68731	42664	48406	82442
41	2008	60624	66924	41392	45692	79020
42	2009	60200	65865	39608	46004	78699
43	2010	58627	64794	38258	44772	75582
44	2011	57732	63912	37331	44549	74965
45	2012	57623	64391	38084	44055	77010
46	2013	57808	64854	38704	45592	80661
47	2014	58725	67146	39021	44171	81897
48	2015	61748	65948	40646	46505	83867
49	2016	63683	68778	43217	49328	87180
50	2017	64806	70157	42865	51425	85491
51	2018	65127	72005	42977	52974	89491

United States Cancer Statistics 1999-2018

Final Data Science Tutorial by Thien Tran

1. Background

2. Project goal

3. Plan

4. Data preview

Extraction

Load

Transform and clean

Systematic statistics

5. Conclusion

	Notes	Leading Cancer Sites	Leading Cancer Sites Code	Race	Race Code	Year	Year Code	Sex	Sex Code	Age Groups	Age Groups Code	Count
0	NaN	Brain and Other Nervous System	31010-31040	American Indian or Alaska Native	1002-5	1999	1999	Female	F	< 1 year	1	Suppressed
1	NaN	Brain and Other Nervous System	31010-31040	American Indian or Alaska Native	1002-5	1999	1999	Female	F	1-4 years	1-4	Suppressed
2	NaN	Brain and Other Nervous System	31010-31040	American Indian or Alaska Native	1002-5	1999	1999	Female	F	5-9 years	5-9	Suppressed
3	NaN	Brain and Other Nervous System	31010-31040	American Indian or Alaska Native	1002-5	1999	1999	Female	F	10-14 years	10-14	Suppressed
4	NaN	Brain and Other Nervous System	31010-31040	American Indian or Alaska Native	1002-5	1999	1999	Female	F	15-19 years	15-19	Suppressed
...	...	...	...	...	...	...	...	...	...	...	...	...
41795	NaN	Urinary Bladder, invasive and in situ	29010	Other Races and Unknown combined	2131-1	2008	2008	Male	M	65-69 years	65-69	68
41796	NaN	Urinary Bladder, invasive and in situ	29010	Other Races and Unknown combined	2131-1	2008	2008	Male	M	70-74 years	70-74	63
41797	NaN	Urinary Bladder, invasive and in situ	29010	Other Races and Unknown combined	2131-1	2008	2008	Male	M	75-79 years	75-79	50
41798	NaN	Urinary Bladder, invasive and in situ	29010	Other Races and Unknown combined	2131-1	2008	2008	Male	M	80-84 years	80-84	32
41799	NaN	Urinary Bladder, invasive and in situ	29010	Other Races and Unknown combined	2131-1	2008	2008	Male	M	85+ years	85+	25