# Richard Waterman
# November 2023
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime, date
# Load the outpatient appointment data from the course data-set folder.
os.chdir('C:\\Users\\water\\Dropbox (Penn)\\Teaching\\4770f2023\\DataSets')
op_data = pd.read_csv(
    "Outpatient.csv",
    parse_dates=['SchedDate', 'ApptDate'],  # Read both date columns as datetimes.
)
# Schedule lag: the appointment date minus the date on which it was scheduled.
op_data['ScheduleLag'] = op_data['ApptDate'].sub(op_data['SchedDate'])
# Notebook output kept for reference (presumably the columns of op_data):
# Index(['PID', 'SchedDate', 'ApptDate', 'Dept', 'Language', 'Sex', 'Age',
#        'Race', 'Status', 'ScheduleLag'],
# Start from seaborn's default theme.
sns.set()
# Then set a default plot size for axes-level plots.
sns.set(rc={'figure.figsize': (4.5, 3.18)})
# New variable: the schedule lag expressed in days.
op_data['SL'] = op_data['ScheduleLag'].dt.days
# A default boxplot. The trailing semi-colon is a trick to stop unwanted output on the terminal.
sns.boxplot(data=op_data, x='SL');
pass  # Another way to suppress unwanted text output from the plot commands.
# The column name passed as x is a quoted string, while the data argument is
# the (unquoted) DataFrame object itself.
# "Flier" is the name for the outliers; fliersize sets their marker size.
sns.boxplot(data=op_data, x='SL', color='red', fliersize=1.0);
# Same plot again, now with the extra notch and sym keyword arguments.
g = sns.boxplot(
    data=op_data,
    x='SL',
    color='red',
    fliersize=1.0,
    notch=True,
    sym="",
);
# Notebook output: <class 'matplotlib.axes._subplots.AxesSubplot'>
# A boxplot again, this time through the figure-level "catplot" command.
g = sns.catplot(
    data=op_data,
    x='SL',
    kind="box",
    height=2,
    aspect=2,
)
# The returned object has built-in methods for tweaking the plot.
g.set_xlabels("Schedule lag")
g.fig.suptitle('Distribution of schedule lags')
# Notebook output: <class 'seaborn.axisgrid.FacetGrid'>
# A violin plot, from the "catplot" figure-level command.
g = sns.catplot(
    data=op_data,
    x='SL',
    kind="violin",
    height=4,
)
# Tweak the axis label and title with the built-in methods.
g.set_xlabels("Schedule lag")
g.fig.suptitle('Distribution of schedule lags');
# Histogram without the kernel density estimate, but with a rug plot added.
sns.displot(
    op_data['SL'],
    kde=False,
    rug=True,
    height=4,
    aspect=2,
);
import os  # Repeats the import at the top of the file; harmless.
# Move to the folder that receives the image files.
os.chdir('C:\\Users\\water\\Dropbox (Penn)\\Teaching\\4770f2023\\Images')
# Histogram of the schedule lag, this time with the kernel density estimate.
sns.displot(op_data['SL'], kde=True, height=4, aspect=2);
# Write the same figure with the savefig method in png, jpeg and svg formats.
for image_format in ("png", "jpeg", "svg"):
    plt.savefig("output_{0}.{1}".format('OP', image_format))
#### Build the plot
# NOTE(review): top_dept is not defined anywhere in the visible file, and the
# original used new_dept before assigning it (a NameError when run as a script).
# The five most frequent departments are assumed here, mirroring the
# value_counts()[:5] used for dept_counts further down -- confirm the cutoff.
top_dept = op_data['Dept'].value_counts()[:5]
# Subset to the rows whose department is one of the top departments (all columns kept).
new_dept = op_data.loc[op_data['Dept'].isin(top_dept.index)]
g = sns.catplot(x='Dept', kind="count", palette="ch:.25", data=new_dept, height=3, aspect=2) #ch stands for a "cube-helix" color palette.
g.set_xticklabels(rotation=45, horizontalalignment='right')
plt.subplots_adjust(bottom=0.4) # This adds more white space to the bottom of the plot.
plt.savefig("output_{0}.png".format('Dept')) # savefig method.
# Department counts, with one panel per value of Sex (note the 'col' argument).
g = sns.catplot(
    data=new_dept,
    x='Dept',
    kind="count",
    palette="ch:.25",
    col="Sex",
    height=4,
    aspect=2,
)
# Rotate the department names.
g.set_xticklabels(rotation=45, horizontalalignment='right');
# Department counts on a grid of panels: rows by Sex, columns by Status
# (note the 'row' and 'col' arguments).
g = sns.catplot(
    data=new_dept,
    x='Dept',
    kind="count",
    palette="ch:.25",
    row='Sex',
    col="Status",
    height=2,
    aspect=1.5,
)
g.set_xticklabels(rotation=45, horizontalalignment='right')
# Only the frequencies are needed here: the first five entries of the value counts.
dept_counts = op_data['Dept'].value_counts()[:5]
# A pie chart drawn with pandas' own plotting interface rather than seaborn.
dept_counts.plot(kind='pie', figsize=(6, 6));
# NOTE(review): custom_palette and wharton_colors are not defined in the
# visible part of this file -- confirm where they are created.
# Department counts drawn with the custom palette.
g = sns.catplot(
    data=new_dept,
    x='Dept',
    kind="count",
    palette=custom_palette,
    height=4,
    aspect=2,
)
g.set_xticklabels(rotation=45, horizontalalignment='right');
# The same count plot drawn with the wharton_colors palette.
g = sns.catplot(
    data=new_dept,
    x='Dept',
    kind="count",
    palette=wharton_colors,
    height=3,
    aspect=2,
)
g.set_xticklabels(rotation=45, horizontalalignment='right');
# Switch to the "flare" palette.
sns.set_palette(sns.color_palette("flare"))
# Subset the data to the rows whose department appears in top_dept.
new_dept = op_data.loc[op_data['Dept'].isin(top_dept.index)]
# The default "catplot": schedule lag in days for each department.
g = sns.catplot(
    data=new_dept,
    x='Dept',
    y="SL",
    height=4,
    aspect=2,
)
g.set_xticklabels(rotation=45, horizontalalignment='right');
# Switch to the "Set2" palette.
sns.set_palette(sns.color_palette("Set2"))
# Comparison boxplots: one box of schedule lag per department.
g = sns.catplot(
    data=new_dept,
    x='Dept',
    y="SL",
    kind="box",
    height=3,
    aspect=2,
)
g.set_xticklabels(rotation=45, horizontalalignment='right');
# The same comparison drawn as violin plots.
g = sns.catplot(
    data=new_dept,
    x='Dept',
    y="SL",
    kind="violin",
    height=3,
    aspect=2,
)
g.set_xticklabels(rotation=45, horizontalalignment='right');
# Boxplots per department, split by Status (note the 'hue' argument).
sns.catplot(
    data=new_dept,
    x='Dept',
    y="SL",
    hue='Status',
    kind="box",
    height=3,
    aspect=3,
);
# Create a matplotlib figure and axes first so that the plot size can be set.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
# Back to the axes-level boxplot, drawn on the axes created above.
sns.boxplot(data=new_dept, x='Dept', y="SL", hue='Status', ax=ax);
# savefig method for png format.
plt.savefig("output_{0}.png".format('Comps'))
# The 'bar' kind: one bar per department and Status, with red error bars
# showing a 95% confidence interval.
g = sns.catplot(
    data=new_dept,
    x='Dept',
    y="SL",
    hue='Status',
    kind="bar",
    height=4,
    aspect=3,
    errcolor="red",
    errorbar=('ci', 95),
)
g.set_ylabels("Average Schedule lag");
# Return to the data-set folder and read in the car data.
os.chdir('C:\\Users\\water\\Dropbox (Penn)\\Teaching\\4770f2023\\DataSets')
car_data = pd.read_csv(
    "Car08_just_499.csv",
)
# Notebook output kept for reference (presumably the columns of car_data):
# Index(['Make/Model', 'MPG_City', 'MPG_Hwy', 'Weight(lb)', 'Seating',
#        'Horsepower', 'HP/Pound', 'Displacement', 'Cylinders', 'Origin',
#        'Transmission', 'EPA_Class', 'Length', 'Fuel', 'HEV', 'Turbocharger',
#        'Make', 'Model', 'GP1000M_City', 'GP1000M_Hwy'],
# This is a simple plot of the two variables.
# GP1000M_City against weight; colour encodes Transmission and marker style encodes Cylinders.
sns.relplot(
    data=car_data,
    x="Weight(lb)",
    y="GP1000M_City",
    hue="Transmission",
    style="Cylinders",
);
# The same plot with the "copper" palette.
sns.relplot(
    data=car_data,
    x="Weight(lb)",
    y="GP1000M_City",
    hue="Transmission",
    style="Cylinders",
    palette="copper",
);
# Add a size encoding for Horsepower (sizes ranging from 20 to 200) and use the "Reds" palette.
sns.relplot(
    data=car_data,
    x="Weight(lb)",
    y="GP1000M_City",
    hue="Transmission",
    size="Horsepower",
    sizes=(20, 200),
    style="Cylinders",
    palette="Reds",
    height=3,
    aspect=2,
);
# Joint plot of the same two variables using kernel density estimates.
sns.jointplot(
    data=car_data,
    x="Weight(lb)",
    y="GP1000M_City",
    color="red",
    kind="kde",
    height=5,
);
# Change the style: white grid background and a smaller font scale.
sns.set(style="whitegrid", font_scale=0.75)
# Keep only the columns wanted in the pairs plot.
tmp_data = car_data[
    ['GP1000M_City', 'Weight(lb)', 'Horsepower', 'Length', 'Transmission']
]
# Pairwise plots of the selected columns, coloured by Transmission.
sns.pairplot(tmp_data, hue="Transmission", height=2);
# savefig method for png format.
plt.savefig("output_{0}.png".format('Cars'))
# This is where the plot will be saved. Every backslash is escaped: the original
# "\images" used the invalid escape sequence "\i", which newer Python versions
# warn about. The resulting path is unchanged.
os.chdir('C:\\Users\\water\\Dropbox (Penn)\\Teaching\\4770f2023\\Notes\\images')
# Create 8 levels of schedule lag with equal numbers in each category.
op_data['SL_8Cut'] = pd.qcut(op_data['SL'], 8)
# Controlling plot size using matplotlib; subplots returns a figure and the axes to plot on.
fig, ax1 = plt.subplots(figsize=(18, 9))
plt.rcParams['font.size'] = 12
plt.rcParams['text.color'] = 'black'
# NOTE(review): mosaic is not imported in the visible part of this file --
# presumably statsmodels.graphics.mosaicplot.mosaic; confirm and add the import.
mosaic(op_data, ['SL_8Cut', 'Status'], ax=ax1); # Plots the mosaic plot on the axes "ax1".
plt.savefig("mosaic_{0}.png".format('OP')) # savefig method for png format.
plt.close() # This will stop the plot being displayed here.