Exploring Two or More Variables¶

Hexagonal Binning and Contours (Plotting Numeric Versus Numeric Data)¶

In [1]:

Copied!

import pandas as pd
import seaborn as sns
import pandas as pd
import seaborn as sns

In [2]:

Copied!

# Load the kc_tax dataset (columns: TaxAssessedValue, SqFtTotLiving, ZipCode).
# The cell body was duplicated, which parsed the ~500k-row CSV twice.
kc_tax0 = pd.read_csv('../data/kc_tax.csv')
kc_tax0

Out[2]:

	TaxAssessedValue	SqFtTotLiving	ZipCode
0	NaN	1730	98117.0
1	206000.0	1870	98002.0
2	303000.0	1530	98166.0
3	361000.0	2000	98108.0
4	459000.0	3150	98108.0
...	...	...	...
498244	375000.0	2230	98056.0
498245	316000.0	1710	98056.0
498246	340000.0	1930	98056.0
498247	132000.0	2930	98056.0
498248	286000.0	1310	98056.0

498249 rows × 3 columns

In [3]:

Copied!





# Keep only records with an assessed value below 750,000 and a living area
# strictly between 100 and 3,500 square feet.
# Rows whose TaxAssessedValue is NaN compare False and are therefore dropped too.
value_ok = kc_tax0["TaxAssessedValue"] < 750_000
area_ok = (kc_tax0["SqFtTotLiving"] > 100) & (kc_tax0["SqFtTotLiving"] < 3500)

kc_tax0_strip = kc_tax0.loc[value_ok & area_ok]
kc_tax0_strip

Out[3]:

	TaxAssessedValue	SqFtTotLiving	ZipCode
1	206000.0	1870	98002.0
2	303000.0	1530	98166.0
3	361000.0	2000	98108.0
4	459000.0	3150	98108.0
5	223000.0	1570	98032.0
...	...	...	...
498244	375000.0	2230	98056.0
498245	316000.0	1710	98056.0
498246	340000.0	1930	98056.0
498247	132000.0	2930	98056.0
498248	286000.0	1310	98056.0

432693 rows × 3 columns

In [4]:

Copied!

# Hexagonal-bin plot of living area vs. assessed value; the colour of each
# hexagon encodes the number of records that fall into it.
# NOTE(review): sharex=False is presumably the usual workaround for pandas
# hiding the x-axis labels when a colorbar is drawn — confirm before removing.
ax = kc_tax0_strip.plot.hexbin(
    x="SqFtTotLiving",
    y="TaxAssessedValue",
    gridsize=30,
    sharex=False,
    figsize=(5, 4),
)
# Human-readable labels, matching the ones used on the faceted plot further down.
_ = ax.set(xlabel="Finished Square Feet", ylabel="Tax-Assessed Value")

No description has been provided for this image

As you can see, the hexagonal binning plot shows the tax-assessed value, the finished square footage, and the number of records in each bin all at once.

Contours¶

We draw a random sample from our dataset because computing and rendering the contours for the full dataset takes too much time.

In [5]:

Copied!

# Fixed-size random subset for the contour plot; random_state pins the draw so
# a re-run selects the same 5,000 rows.
sample = kc_tax0_strip.sample(n=5000, random_state=42)

In [6]:

Copied!

# Filled 2-D density contours of living area vs. assessed value for the sample.
# Drawn once: the duplicated call overplotted a second set of contours.
ax = sns.kdeplot(data=sample, x="SqFtTotLiving", y="TaxAssessedValue", fill=True)
_ = ax.set(xlabel="Finished Square Feet", ylabel="Tax-Assessed Value")

Contingency Tables¶

In [7]:

Copied!

# Load the loan records (columns: status, grade).
# The cell body was duplicated, which parsed the CSV twice.
lc_loans = pd.read_csv("../data/lc_loans.csv")
lc_loans

Out[7]:

	status	grade
0	Fully Paid	B
1	Charged Off	C
2	Fully Paid	C
3	Fully Paid	C
4	Current	B
...	...	...
450956	Current	D
450957	Current	D
450958	Current	D
450959	Current	D
450960	Fully Paid	A

450961 rows × 2 columns

In [8]:

Copied!

# Contingency table of loan counts: one row per grade, one column per status.
# margins=True appends the "All" row and column holding the totals.
# pd.crosstab is the purpose-built tool for this and replaces the
# pivot_table + `lambda x: len(x)` workaround.
crosstab = pd.crosstab(index=lc_loans["grade"], columns=lc_loans["status"], margins=True)
crosstab

Out[8]:

status	Charged Off	Current	Fully Paid	Late	All
grade
A	1562	50051	20408	469	72490
B	5302	93852	31160	2056	132370
C	6023	88928	23147	2777	120875
D	5007	53281	13681	2308	74277
E	2842	24639	5949	1374	34804
F	1526	8444	2328	606	12904
G	409	1990	643	199	3241
All	22671	321185	97316	9789	450961

In [9]:

Copied!





# Turn the count table into proportions without touching `crosstab` itself,
# so the table displayed by the previous cell stays valid.
# Only the grade rows A..G are used; the "All" totals row is left out.
grade_counts = crosstab.loc["A":"G"].astype("float")

# Status columns: share of each status within its grade (every row divided by
# that row's total).
perc_crosstab = grade_counts.loc[:, "Charged Off":"Late"].div(grade_counts["All"], axis="index")

# "All" column: share of each grade among all loans.
perc_crosstab["All"] = grade_counts["All"] / grade_counts["All"].sum()
perc_crosstab

Out[9]:

status	Charged Off	Current	Fully Paid	Late	All
grade
A	0.021548	0.690454	0.281528	0.006470	0.160746
B	0.040054	0.709013	0.235401	0.015532	0.293529
C	0.049828	0.735702	0.191495	0.022974	0.268039
D	0.067410	0.717328	0.184189	0.031073	0.164708
E	0.081657	0.707936	0.170929	0.039478	0.077177
F	0.118258	0.654371	0.180409	0.046962	0.028614
G	0.126196	0.614008	0.198396	0.061401	0.007187

In [ ]:

Categorical and Numeric Data¶

In [10]:

Copied!

# Load the airline delay statistics (three pct_*_delay columns plus the airline name).
# The cell body was duplicated, which parsed the CSV twice.
airline_stats = pd.read_csv("../data/airline_stats.csv")
airline_stats

Out[10]:

	pct_carrier_delay	pct_atc_delay	pct_weather_delay	airline
0	8.153226	1.971774	0.762097	American
1	5.959924	3.706107	1.585878	American
2	7.157270	2.706231	2.026706	American
3	12.100000	11.033333	0.000000	American
4	7.333333	3.365591	1.774194	American
...	...	...	...	...
33463	6.186422	8.798491	1.651940	Southwest
33464	9.522167	3.591133	0.261084	Southwest
33465	9.164179	2.664179	0.343284	Southwest
33466	5.152293	1.964520	0.122817	Southwest
33467	3.964393	1.700479	0.019449	Southwest

33468 rows × 4 columns

In [11]:

Copied!

# One box per airline for the carrier-delay percentage.
# Drawn once: the duplicated call produced the same figure a second time.
ax = airline_stats.boxplot(by="airline", column="pct_carrier_delay")
# Same y-axis label as the violin plot of this column.
_ = ax.set_ylabel("% of daily airline delays by carrier")

Boxplots are informative, but here the large number of outliers clutters the figure. A violin plot avoids this cluttered appearance because it draws the shape of each distribution instead of plotting every outlier as a separate point.

In [12]:

Copied!

# Violin plot of the carrier-delay percentage per airline; inner="quartile"
# draws the quartile lines inside each violin.
# Drawn once: the duplicated call overplotted a second set of violins.
ax = sns.violinplot(
    data=airline_stats,
    x="airline",
    y="pct_carrier_delay",
    inner="quartile",
    color="white",
)
_ = ax.set_ylabel("% of daily airline delays by carrier")

Visualizing Multiple Variables¶

In [13]:

Copied!

# Restrict the tax data to four zip codes for the faceted plot.
# ZipCode is stored as a float (e.g. 98108.0); isin() still matches these ints.
# NOTE(review): this selects from the unfiltered kc_tax0 rather than
# kc_tax0_strip; the faceted hexbin limits the plotted range via `extent`.
# Confirm that using the unfiltered frame is intended.
zip_codes = [98188, 98105, 98108, 98126]
kc_tax_zip = kc_tax0.loc[kc_tax0["ZipCode"].isin(zip_codes)]
kc_tax_zip

Out[13]:

	TaxAssessedValue	SqFtTotLiving	ZipCode
3	361000.0	2000	98108.0
4	459000.0	3150	98108.0
10	202000.0	830	98108.0
11	210000.0	1130	98108.0
12	193000.0	1560	98108.0
...	...	...	...
498049	346000.0	1430	98105.0
498050	463000.0	1610	98105.0
498051	553000.0	1580	98105.0
498052	571000.0	1840	98105.0
498053	694000.0	2420	98105.0

22038 rows × 3 columns

In [14]:

Copied!





import matplotlib.pyplot as plt


def hexbin(x, y, color, gridsize=25, **kwargs):
    """Draw a hexagonal-bin plot on the current matplotlib axes.

    Intended to be passed to ``FacetGrid.map``, which calls it once per facet
    and supplies the facet colour through the ``color`` keyword.

    Parameters
    ----------
    x, y : array-like
        Coordinates of the points to bin.
    color : matplotlib color
        Base colour; bins are shaded with a light-to-``color`` palette.
    gridsize : int, default 25
        Forwarded to ``plt.hexbin`` as its ``gridsize``. Exposed as a
        parameter so callers can override it; when it was hard-coded, passing
        ``gridsize`` through ``kwargs`` raised a duplicate-keyword TypeError.
    **kwargs
        Forwarded unchanged to ``plt.hexbin`` (for example ``extent``).
    """
    cmap = sns.light_palette(color, as_cmap=True)
    plt.hexbin(x, y, gridsize=gridsize, cmap=cmap, **kwargs)


# One panel per zip code, two panels per row.
g = sns.FacetGrid(kc_tax_zip, col="ZipCode", col_wrap=2)
# The same `extent` is passed for every panel, so all hexagon grids cover the
# same x/y range.
g.map(hexbin, 'SqFtTotLiving', 'TaxAssessedValue', extent=[0, 3500, 0, 700000])
g.set_axis_labels("Finished Square Feet", "Tax-Assessed Value")
# ZipCode is a float column, so ".0f" formats it without the decimal part.
g.set_titles("Zip code {col_name:.0f}")

Out[14]:

<seaborn.axisgrid.FacetGrid at 0x7efc0478cd70>