import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols

/opt/conda/lib/python3.9/site-packages/statsmodels/compat/pandas.py:65: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
  from pandas import Int64Index as NumericIndex


# The three source tables are fetched as CSV from opendata.maryland.gov.
# The bare .head() expressions only preview the first rows when run in a notebook.

# income per capita table
income_per_capita = pd.read_csv(
    filepath_or_buffer='https://opendata.maryland.gov/resource/q4mi-9fr9.csv'
)
income_per_capita.head()


# poverty rate table
poverty_rate = pd.read_csv(
    filepath_or_buffer='https://opendata.maryland.gov/resource/iudf-4y2j.csv'
)
poverty_rate.head()


# median income table
median_income = pd.read_csv(
    filepath_or_buffer='https://opendata.maryland.gov/resource/bvk4-qsxs.csv'
)
median_income.head()


# count the missing entries in each column of the income per capita table
income_per_capita.isna().sum(axis=0)

date_created              0
year                      0
maryland                  0
allegany_county           0
anne_arundel_county       0
baltimore_city            0
baltimore_county          0
calvert_county            0
caroline_county           0
carroll_county            0
cecil_county              0
charles_county            0
dorchester_county         0
frederick_county          0
garrett_county            0
harford_county            0
howard_county             0
kent_county               0
montgomery_county         0
prince_george_s_county    0
queen_anne_s_county       0
somerset_county           0
st_mary_s_county          0
talbot_county             0
washington_county         0
wicomico_county           0
worcester_county          0
dtype: int64


# count the missing entries in each column of the poverty rate table
poverty_rate.isna().sum(axis=0)

date_created              0
year                      0
estimate                  0
maryland                  0
allegany_county           0
anne_arundel_county       0
baltimore_city            0
baltimore_county          0
calvert_county            0
caroline_county           0
carroll_county            0
cecil_county              0
charles_county            0
dorchester_county         0
frederick_county          0
garrett_county            0
harford_county            0
howard_county             0
kent_county               0
montgomery_county         0
prince_george_s_county    0
queen_anne_s_county       0
somerset_county           0
st_mary_s_county          0
talbot_county             0
washington_county         0
wicomico_county           0
worcester_county          0
dtype: int64


# count the missing entries in each column of the median income table
median_income.isna().sum(axis=0)

date_created              0
year                      0
data                      0
maryland                  0
allegany_county           0
anne_arundel_county       0
baltimore_city            0
baltimore_county          0
calvert_county            0
caroline_county           0
carroll_county            0
cecil_county              0
charles_county            0
dorchester_county         0
frederick_county          0
garrett_county            0
harford_county            0
howard_county             0
kent_county               0
montgomery_county         0
prince_george_s_county    0
queen_anne_s_county       0
somerset_county           0
st_mary_s_county          0
talbot_county             0
washington_county         0
wicomico_county           0
worcester_county          0
dtype: int64


# Income per capita table: discard the 'date_created' column and key the
# remaining rows by 'year'.
income_per_capita = (
    income_per_capita
    .drop(columns=['date_created'])
    .set_index('year')
)
income_per_capita.head()


# Poverty rate table: keep only the rows whose 'estimate' is 'Poverty Rate',
# then discard the bookkeeping columns and key the rows by 'year'.
poverty_rate = (
    poverty_rate
    .loc[lambda table: table['estimate'] == 'Poverty Rate']
    .drop(columns=['date_created', 'estimate'])
    .set_index('year')
)
poverty_rate.head()


# Median income table: keep only the rows whose 'data' is 'Income',
# then discard the bookkeeping columns and key the rows by 'year'.
median_income = (
    median_income
    .loc[lambda table: table['data'] == 'Income']
    .drop(columns=['date_created', 'data'])
    .set_index('year')
)
median_income.head()


# combining all county data into one long table indexed by (year, county)

# getting the years that we are analyzing
years = poverty_rate.index

# getting all the counties in Maryland: every column except the statewide
# 'maryland' total, removed by label rather than by position
counties = income_per_capita.columns.drop('maryland')

# one row per (year, county) pair, with the year varying slowest
index = pd.MultiIndex.from_product([years, counties], names=['years', 'county'])

# Each table is selected by label, so the values line up with the index above
# even if the three source tables order their county columns differently.
# ravel() flattens row by row (all counties of one year, then the next year),
# which is the same order from_product() generates.
all_data = pd.DataFrame(
    {
        'income_per_capita': income_per_capita.loc[years, counties].to_numpy().ravel(),
        'median_income': median_income.loc[years, counties].to_numpy().ravel(),
        'poverty_rate': poverty_rate.loc[years, counties].to_numpy().ravel(),
    },
    index=index,
)
all_data.head()


# Statewide series: the 'maryland' column of each income table.
maryland_per_capita = income_per_capita['maryland']
maryland_median = median_income['maryland']

# Both statewide income curves share one 10x10 figure.
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(years, maryland_per_capita, label="Income Per Capita")
ax.plot(years, maryland_median, label="Median Income")
ax.set(
    title='Income Graph of Maryland from 2010 to 2019',
    xlabel='Year',
    ylabel='Income',
)
ax.legend()
plt.show()


# plotting the median income trend of every county, one line per county
plt.figure(figsize=(15,10))

# Build the grouping once instead of once per county. Grouping by the index
# level name (not a one-element list) lets get_group() take the plain county
# label without the length-1-list deprecation warning of recent pandas.
median_by_county = all_data.groupby(level='county')['median_income']
for county in counties:
    median = median_by_county.get_group(county)
    plt.plot(years, median, marker = 'o', label=county)
plt.title('Median Income Graph of Counties from 2010 to 2019')
plt.xlabel('Year')
plt.ylabel('Income')
# legend sits below the axes in five columns so it does not cover the lines
plt.legend(loc=9, bbox_to_anchor=(0.5, -0.1), ncol = 5)
plt.show()


# plotting the income per capita trend of every county, one line per county
plt.figure(figsize=(15,10))

# Build the grouping once instead of once per county. Grouping by the index
# level name (not a one-element list) lets get_group() take the plain county
# label without the length-1-list deprecation warning of recent pandas.
per_capita_by_county = all_data.groupby(level='county')['income_per_capita']
for county in counties:
    per_capita = per_capita_by_county.get_group(county)
    plt.plot(years, per_capita, marker = 'o', label=county)
plt.title('Income Per Capita Graph of Counties from 2010 to 2019')
plt.xlabel('Year')
plt.ylabel('Income')
# legend sits below the axes in five columns so it does not cover the lines
plt.legend(loc=9, bbox_to_anchor=(0.5, -0.1), ncol = 5)
plt.show()


# plotting one graph per year, comparing both income measures across counties

# Build the per-year grouping once instead of twice per iteration. Grouping by
# the index level name (not a one-element list) lets get_group() take the
# plain year label without the length-1-list deprecation warning.
data_by_year = all_data.groupby(level='years')
for year in years:
    # extracting the income per capita and median income of every county for this year
    year_data = data_by_year.get_group(year)
    per_capita = year_data['income_per_capita']
    median = year_data['median_income']

    # setting the size of the graph
    plt.figure(figsize=(10,10))

    # plotting the income graph
    plt.plot(counties, per_capita, label="Income Per Capita")
    plt.plot(counties, median, label="Median Income")
    plt.title('Income Graph in ' + str(year))
    plt.xlabel('County')
    # county names are long, so the tick labels are drawn vertically
    plt.xticks(rotation = 90)
    plt.ylabel('Income')
    plt.legend()
    plt.show()


# Statewide poverty rate: the 'maryland' column of the poverty rate table.
maryland_poverty_rate = poverty_rate['maryland']

# Draw the statewide poverty rate against the year on a 10x10 figure.
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(years, maryland_poverty_rate)
ax.set(
    title='Graph of Maryland Poverty Rate from 2010 to 2019',
    xlabel='Year',
    ylabel='Poverty Rate',
)
plt.show()


# setting the size of the graph
plt.figure(figsize=(10,10))

# plotting the poverty rate of every county for the last year in `years`.
# The values are selected by county label so each one is paired with the
# matching x-axis label, regardless of the column order of the poverty table.
plt.plot(counties, poverty_rate.loc[years[-1], counties])
plt.title('Graph of Poverty Rate by County')
plt.xlabel('County')
# county names are long, so the tick labels are drawn vertically
plt.xticks(rotation=90)
plt.ylabel('Poverty Rate')
plt.show()


# Fit a straight line to the statewide median income:
# x holds the years (the index of the series), y the median income values.
x = maryland_median.index
y = maryland_median

# degree-1 least-squares fit: m is the slope, c the intercept
m, c = np.polyfit(x, y, 1)

fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(x, y, 'o')       # observed values as markers
ax.plot(x, m*x + c)      # fitted line evaluated at the same years
ax.set(
    xlabel='Year',
    ylabel='Median Income',
    title='Predicted Graph of Median Income',
)
plt.show()


# report the fitted slope and intercept
print(f'The slope, m is {m} and the intercept, c is {c}')

The slope, m is 1933.1151515151494 and the intercept, c is -3818109.2727272687


# evaluate the fitted line at the year 2022
c + m * 2022

90649.56363636348


# Two single-feature linear regressions, both with the poverty rate as target:
# one uses income per capita as the feature, the other median income.
per_capita_model = linear_model.LinearRegression()
median_model = linear_model.LinearRegression()

# scikit-learn expects 2-D inputs, so every column becomes an (n, 1) array
x1 = all_data['income_per_capita'].to_numpy().reshape(-1, 1)
x2 = all_data['median_income'].to_numpy().reshape(-1, 1)
y = all_data['poverty_rate'].to_numpy().reshape(-1, 1)

# fitting each model on its own feature against the shared target
per_capita_model.fit(x1, y)
median_model.fit(x2, y)

LinearRegression()


# Scatter of poverty rate against income per capita, with the fitted line on top.
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(all_data['income_per_capita'], all_data['poverty_rate'], 'o')

# line predicted by the scikit-learn model for the same income values
predicted = per_capita_model.predict(x1)
ax.plot(all_data['income_per_capita'], predicted)

ax.set(
    xlabel='Income per capita',
    ylabel='Poverty Rate',
    title='Poverty Rate vs Income Per Capita',
)
plt.show()

# NOTE(review): this formula regresses income_per_capita on poverty_rate, the
# reverse direction of the scikit-learn fit plotted above - confirm it is intended.
results = ols(
    formula='income_per_capita ~ poverty_rate',
    data=all_data,
).fit()
results.summary()


# Scatter of poverty rate against median income, with the fitted line on top.
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(all_data['median_income'], all_data['poverty_rate'], 'o')

# line predicted by the scikit-learn model for the same income values
predicted = median_model.predict(x2)
ax.plot(all_data['median_income'], predicted)

ax.set(
    xlabel='Median Income',
    ylabel='Poverty Rate',
    title='Poverty Rate vs Median Income',
)
plt.show()

# NOTE(review): this formula regresses median_income on poverty_rate, the
# reverse direction of the scikit-learn fit plotted above - confirm it is intended.
results = ols(
    formula='median_income ~ poverty_rate',
    data=all_data,
).fit()
results.summary()

	date_created	year	maryland	allegany_county	anne_arundel_county	baltimore_city	baltimore_county	calvert_county	caroline_county	carroll_county	...	kent_county	montgomery_county	prince_george_s_county	queen_anne_s_county	somerset_county	st_mary_s_county	talbot_county	washington_county	wicomico_county	worcester_county
0	September 29, 2020	2010	52251	33436	55360	39699	51519	52421	36012	50397	...	47491	74028	42782	51842	26854	49021	59672	38740	36230	46449
1	September 29, 2020	2011	53432	33891	56884	40923	51530	53383	37370	51574	...	48498	76529	43336	53009	26897	49897	60929	39419	36134	47180
2	September 29, 2020	2012	53547	33946	57182	40744	51982	53326	38773	51859	...	48590	76901	42842	53617	26830	49300	60548	39822	35419	48977
3	September 29, 2020	2013	52352	34049	56537	41156	51151	52177	39330	51601	...	48917	72577	42140	53209	27756	48499	60864	39646	35649	48894
4	September 29, 2020	2014	53170	34808	57551	42857	52331	52948	39790	52960	...	50625	72746	42425	54075	28881	49133	62278	40548	37043	49840

	date_created	year	estimate	maryland	allegany_county	anne_arundel_county	baltimore_city	baltimore_county	calvert_county	caroline_county	...	kent_county	montgomery_county	prince_george_s_county	queen_anne_s_county	somerset_county	st_mary_s_county	talbot_county	washington_county	wicomico_county	worcester_county
0	2020-09-29T00:00:00.000	2010	Poverty Rate	9.9	17.1	6.6	24.7	8.2	6.2	13.0	...	14.2	7.5	9.4	7.3	19.3	7.5	9.7	11.4	16.6	10.6
1	2020-09-29T00:00:00.000	2010	MOE	0.3	3.0	1.1	1.8	1.2	1.4	2.8	...	3.1	0.8	1.2	1.7	5.5	1.9	2.2	2.1	2.8	2.7
2	2020-09-29T00:00:00.000	2011	Poverty Rate	10.2	19.1	6.5	24.5	9.6	6.1	13.1	...	13.9	6.7	9.4	8.7	26.2	8.6	10.8	11.8	17.7	13.0
3	2020-09-29T00:00:00.000	2011	MOE	0.3	3.4	1.2	1.7	1.2	1.5	2.7	...	3.4	0.8	1.0	1.7	6.0	1.8	2.2	1.7	2.6	2.6
4	2022-04-08T00:00:00.000	2012	Poverty Rate	10.4	18.1	6.3	24.5	9.7	7.0	15.7	...	14.0	6.6	10.3	8.2	29.6	8.4	9.7	13.7	16.7	11.1

	date_created	year	data	maryland	allegany_county	anne_arundel_county	baltimore_city	baltimore_county	calvert_county	caroline_county	...	kent_county	montgomery_county	prince_george_s_county	queen_anne_s_county	somerset_county	st_mary_s_county	talbot_county	washington_county	wicomico_county	worcester_county
0	September 29, 2020	2010	Income	68933	37083	80908	38186	62300	86536	55480	...	49017	88559	69524	78503	38134	81559	56806	51610	47702	55492
1	September 29, 2020	2010	MOE	833	2826	2311	1414	2006	5064	2965	...	4582	2710	1609	5181	2747	5070	3948	3327	3097	3507
2	September 29, 2020	2011	Income	70075	38504	82980	38478	62309	88406	50809	...	49795	92288	70114	75158	35426	80943	55145	52028	45788	48472
3	September 29, 2020	2011	MOE	760	2693	3430	1536	1728	4369	4213	...	4603	2758	1911	6363	3426	2717	4929	2928	3582	4653
4	September 29, 2020	2012	Income	71169	38670	87083	39077	62413	87215	48772	...	49969	94365	69258	79012	34454	85478	61529	52604	50204	55875

	maryland	allegany_county	anne_arundel_county	baltimore_city	baltimore_county	calvert_county	caroline_county	carroll_county	cecil_county	charles_county	...	kent_county	montgomery_county	prince_george_s_county	queen_anne_s_county	somerset_county	st_mary_s_county	talbot_county	washington_county	wicomico_county	worcester_county
year
2010	52251	33436	55360	39699	51519	52421	36012	50397	39607	49941	...	47491	74028	42782	51842	26854	49021	59672	38740	36230	46449
2011	53432	33891	56884	40923	51530	53383	37370	51574	40235	50714	...	48498	76529	43336	53009	26897	49897	60929	39419	36134	47180
2012	53547	33946	57182	40744	51982	53326	38773	51859	40299	50023	...	48590	76901	42842	53617	26830	49300	60548	39822	35419	48977
2013	52352	34049	56537	41156	51151	52177	39330	51601	40262	49016	...	48917	72577	42140	53209	27756	48499	60864	39646	35649	48894
2014	53170	34808	57551	42857	52331	52948	39790	52960	40944	49208	...	50625	72746	42425	54075	28881	49133	62278	40548	37043	49840

	maryland	allegany_county	anne_arundel_county	baltimore_city	baltimore_county	calvert_county	caroline_county	carroll_county	cecil_county	charles_county	...	kent_county	montgomery_county	prince_george_s_county	queen_anne_s_county	somerset_county	st_mary_s_county	talbot_county	washington_county	wicomico_county	worcester_county
year
2010	9.9	17.1	6.6	24.7	8.2	6.2	13.0	5.4	10.5	6.2	...	14.2	7.5	9.4	7.3	19.3	7.5	9.7	11.4	16.6	10.6
2011	10.2	19.1	6.5	24.5	9.6	6.1	13.1	5.5	9.7	7.7	...	13.9	6.7	9.4	8.7	26.2	8.6	10.8	11.8	17.7	13.0
2012	10.4	18.1	6.3	24.5	9.7	7.0	15.7	6.3	11.9	8.6	...	14.0	6.6	10.3	8.2	29.6	8.4	9.7	13.7	16.7	11.1
2013	10.2	18.6	7.3	22.7	9.5	6.9	16.7	6.8	9.8	8.0	...	14.9	7.0	9.9	8.4	28.5	8.2	10.9	12.0	16.5	13.1
2014	10.4	18.5	6.7	23.3	9.8	7.2	16.0	5.9	10.6	7.2	...	13.8	7.2	10.3	7.5	25.5	8.6	11.7	13.8	16.9	11.9

Analysis of Income in Maryland

Introduction

Importing Libraries

Data Collection

Importing Data

Data Management

Missing Entries

Dropping Unused Data

Combining Data

Exploratory Data Analysis And Data Visualization

Analysis of Income Trend

Analysis of Poverty Rate

Hypothesis Testing

Predicting Maryland State Income

Poverty Rates and Income Correlation

Insights Attained

		income_per_capita	median_income	poverty_rate
years	county
2010	allegany_county	33436	37083	17.1
	anne_arundel_county	55360	80908	6.6
	baltimore_city	39699	38186	24.7
	baltimore_county	51519	62300	8.2
	calvert_county	52421	86536	6.2

Dep. Variable:	income_per_capita	R-squared:	0.523
Model:	OLS	Adj. R-squared:	0.521
Method:	Least Squares	F-statistic:	260.7
Date:	Thu, 12 May 2022	Prob (F-statistic):	4.16e-40
Time:	04:23:15	Log-Likelihood:	-2489.8
No. Observations:	240	AIC:	4984.
Df Residuals:	238	BIC:	4991.
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>|t|	[0.025	0.975]
Intercept	6.672e+04	1174.285	56.817	0.000	6.44e+04	6.9e+04
poverty_rate	-1519.6825	94.115	-16.147	0.000	-1705.088	-1334.277

Omnibus:	44.671	Durbin-Watson:	1.866
Prob(Omnibus):	0.000	Jarque-Bera (JB):	64.169
Skew:	1.148	Prob(JB):	1.16e-14
Kurtosis:	4.070	Cond. No.	29.3

Dep. Variable:	median_income	R-squared:	0.745
Model:	OLS	Adj. R-squared:	0.744
Method:	Least Squares	F-statistic:	697.1
Date:	Thu, 12 May 2022	Prob (F-statistic):	1.14e-72
Time:	04:23:16	Log-Likelihood:	-2568.6
No. Observations:	240	AIC:	5141.
Df Residuals:	238	BIC:	5148.
Df Model:	1
Covariance Type:	nonrobust

Omnibus:	10.874	Durbin-Watson:	2.037
Prob(Omnibus):	0.004	Jarque-Bera (JB):	11.641
Skew:	0.532	Prob(JB):	0.00297
Kurtosis:	2.824	Cond. No.	29.3

	coef	std err	t	P>|t|	[0.025	0.975]
Intercept	1.083e+05	1630.796	66.420	0.000	1.05e+05	1.12e+05
poverty_rate	-3450.8962	130.703	-26.402	0.000	-3708.380	-3193.413