import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import linregress
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())
df = pd.read_csv(r'honey_production.csv')
df = df.rename(columns={
'state': 'State',
'numcol': 'Number_of_Colonies',
'yieldpercol': 'Yield_per_Colony',
'totalprod': 'Total_Production',
'stocks': 'Stocks_Held',
'priceperlb': 'Price_per_Pound',
'prodvalue': 'Prod_Value_USD',
'year': 'Year'
})
df[['Number_of_Colonies', 'Total_Production', 'Stocks_Held', 'Prod_Value_USD']] = df[['Number_of_Colonies', 'Total_Production', 'Stocks_Held', 'Prod_Value_USD']].astype(int)
df.head()
| State | Number_of_Colonies | Yield_per_Colony | Total_Production | Stocks_Held | Price_per_Pound | Prod_Value_USD | Year | |
|---|---|---|---|---|---|---|---|---|
| 0 | AL | 16000 | 71 | 1136000 | 159000 | 0.72 | 818000 | 1998 |
| 1 | AZ | 55000 | 60 | 3300000 | 1485000 | 0.64 | 2112000 | 1998 |
| 2 | AR | 53000 | 65 | 3445000 | 1688000 | 0.59 | 2033000 | 1998 |
| 3 | CA | 450000 | 83 | 37350000 | 12326000 | 0.62 | 23157000 | 1998 |
| 4 | CO | 27000 | 72 | 1944000 | 1594000 | 0.70 | 1361000 | 1998 |
df_year = df[['Year','Stocks_Held','Prod_Value_USD','Number_of_Colonies','Total_Production']].groupby('Year').sum().reset_index()
# Create lagged 'Total_Production_Diff' column showing year-over-year difference
df_year['Total_Production_lag'] = df_year['Total_Production'].shift(1)
df_year['Total_Production_Diff'] = df_year['Total_Production'] - df_year['Total_Production_lag']
df_year = df_year.drop(['Total_Production_lag'], axis=1)
df_year['Total_Production_Diff'] = df_year['Total_Production_Diff'].fillna(0).astype(int)
# Create lagged 'Number_of_Colonies_Diff' showing year-over-year difference
df_year['Number_of_Colonies_lag'] = df_year['Number_of_Colonies'].shift(1)
df_year['Number_of_Colonies_Diff'] = df_year['Number_of_Colonies'] - df_year['Number_of_Colonies_lag']
df_year = df_year.drop(['Number_of_Colonies_lag'], axis=1)
df_year['Number_of_Colonies_Diff'] = df_year['Number_of_Colonies_Diff'].fillna(0).astype(int)
df_year_styled = df_year.style.highlight_max(subset=['Total_Production', 'Total_Production_Diff', 'Number_of_Colonies_Diff', 'Number_of_Colonies','Prod_Value_USD', 'Stocks_Held'], color='#85e62c').highlight_min(subset=['Total_Production', 'Total_Production_Diff', 'Number_of_Colonies_Diff', 'Number_of_Colonies','Prod_Value_USD', 'Stocks_Held'], color='#ff6242')
df_year_styled
| Year | Stocks_Held | Prod_Value_USD | Number_of_Colonies | Total_Production | Total_Production_Diff | Number_of_Colonies_Diff | |
|---|---|---|---|---|---|---|---|
| 0 | 1998 | 80565000 | 146091000 | 2621000 | 219519000 | 0 | 0 |
| 1 | 1999 | 78390000 | 123657000 | 2637000 | 202387000 | -17132000 | 16000 |
| 2 | 2000 | 84956000 | 131568000 | 2604000 | 219558000 | 17171000 | -33000 |
| 3 | 2001 | 64764000 | 132282000 | 2542000 | 185748000 | -33810000 | -62000 |
| 4 | 2002 | 39177000 | 227302000 | 2565000 | 171265000 | -14483000 | 23000 |
| 5 | 2003 | 40619000 | 252079000 | 2591000 | 181372000 | 10107000 | 26000 |
| 6 | 2004 | 60981000 | 197307000 | 2539000 | 182729000 | 1357000 | -52000 |
| 7 | 2005 | 62187000 | 160793000 | 2394000 | 173969000 | -8760000 | -145000 |
| 8 | 2006 | 60235000 | 157924000 | 2378000 | 154238000 | -19731000 | -16000 |
| 9 | 2007 | 52405000 | 161356000 | 2428000 | 147621000 | -6617000 | 50000 |
| 10 | 2008 | 50996000 | 229992000 | 2326000 | 162972000 | 15351000 | -102000 |
| 11 | 2009 | 37322000 | 213920000 | 2472000 | 145068000 | -17904000 | 146000 |
| 12 | 2010 | 44797000 | 278370000 | 2666000 | 175294000 | 30226000 | 194000 |
| 13 | 2011 | 36550000 | 258688000 | 2466000 | 147201000 | -28093000 | -200000 |
| 14 | 2012 | 31662000 | 280725000 | 2509000 | 140907000 | -6294000 | 43000 |
df_for_yoy = df_year[df_year['Year']>1998]
for i, bar in enumerate(plt.bar(df_for_yoy['Year'], df_for_yoy['Total_Production_Diff']/1000000, color = ['red' if yi < 0 else 'green' for yi in df_for_yoy['Total_Production_Diff']/1000000])):
height = bar.get_height()
if height >= 0: # if column is positive, add data label on top, else add on bottom
plt.text(bar.get_x() + bar.get_width()/2, height+1, str(int(height)), ha='center', va='bottom')
else:
plt.text(bar.get_x() + bar.get_width()/2, height-1, str(int(height)), ha='center', va='top')
plt.xlabel('Year')
plt.ylabel("Production difference (millions lbs)")
plt.title("Difference in Honey Production Year-over-Year")
plt.gca().set_facecolor('#ECECEC')
# Set the y-axis limit (so data labels not cut off)
plt.ylim(top=max(df_for_yoy['Total_Production_Diff']/1000000)+max(df_for_yoy['Total_Production_Diff']/1000000/6))
plt.ylim(bottom=min(df_for_yoy['Total_Production_Diff']/1000000)+min(df_for_yoy['Total_Production_Diff']/1000000/6))
(-39.445, 35.263666666666666)
for i, bar in enumerate(plt.bar(df_for_yoy['Year'], df_for_yoy['Number_of_Colonies_Diff']/1000, color = ['red' if yi < 0 else 'green' for yi in df_for_yoy['Number_of_Colonies_Diff']/1000])):
height = bar.get_height()
if height >= 0: # if column is positive, add data label on top, else add on bottom
plt.text(bar.get_x() + bar.get_width()/2, height+1, str(int(height)), ha='center', va='bottom')
else:
plt.text(bar.get_x() + bar.get_width()/2, height-1, str(int(height)), ha='center', va='top')
plt.xlabel('Year')
plt.ylabel("Difference in number of colonies (thousands)")
plt.title("Difference in Number of Colonies Year-over-Year")
plt.gca().set_facecolor('#ECECEC')
# Set the y-axis limit (so data labels not cut off)
plt.ylim(top=max(df_for_yoy['Number_of_Colonies_Diff']/1000)+max(df_for_yoy['Number_of_Colonies_Diff']/1000/6))
plt.ylim(bottom=min(df_for_yoy['Number_of_Colonies_Diff']/1000)+min(df_for_yoy['Number_of_Colonies_Diff']/1000/6))
(-233.33333333333334, 226.33333333333334)
plt.figure(figsize=(14,8))
plt.plot(df_year['Year'],df_year['Prod_Value_USD']/1000000, linewidth=3, marker='o', markeredgewidth=11, color='red')
plt.title('Value of Honey Production Over Time', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylim(0, None)
plt.ylabel('Production value (millions USD)', fontsize=14)
plt.tick_params(axis='both', which='major', labelsize=14)
plt.gca().set_facecolor('#ECECEC')
# Create and plot trend line
slope, intercept, r_value, p_value, std_err = linregress(df_year['Year'], df_year['Prod_Value_USD']/1000000)
plt.plot(df_year['Year'], intercept + slope*df_year['Year'], 'r', label='fitted line', linestyle='--')
[<matplotlib.lines.Line2D at 0x1c5c4328820>]
plt.figure(figsize=(14,8))
plt.plot(df_year['Year'],df_year['Total_Production']/1000000, linewidth=3, marker='o', markeredgewidth=11, color='red')
plt.title('Total Honey Production Over Time', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylim(0, None)
plt.ylabel('Honey production (millions lbs)', fontsize = 14)
plt.tick_params(axis='both', which='major', labelsize=14)
plt.gca().set_facecolor('#ECECEC')
# Create and plot trend line
slope, intercept, r_value, p_value, std_err = linregress(df_year['Year'], df_year['Total_Production']/1000000)
plt.plot(df_year['Year'], intercept + slope*df_year['Year'], 'r', label='fitted line', linestyle='--')
# Set the y-axis limit
plt.ylim(top=max(df_year['Total_Production']/1000000)*1.1)
(0.0, 241.5138)
The year 2012 was chosen because it is the final year in the dataset.
df_2012_state_yield = pysqldf("""
SELECT
State, Yield_per_Colony
FROM df
WHERE Year = 2012
GROUP BY State
ORDER BY Yield_per_Colony DESC
""")
df_2012_state_yield.head(10)
| State | Yield_per_Colony | |
|---|---|---|
| 0 | MS | 118 |
| 1 | LA | 86 |
| 2 | HI | 75 |
| 3 | WI | 69 |
| 4 | ND | 69 |
| 5 | MN | 67 |
| 6 | NE | 65 |
| 7 | FL | 64 |
| 8 | SD | 63 |
| 9 | AR | 63 |
# Normalize the data to be between 0 and 1 for colouring
norm = plt.Normalize(min(df_2012_state_yield['Yield_per_Colony']), max(df_2012_state_yield['Yield_per_Colony']))
colors = plt.cm.Reds(norm(df_2012_state_yield['Yield_per_Colony']))
# Plot the bar chart
df_2012_state_yield.plot.bar(x='State',
y='Yield_per_Colony',
rot=90,
fontsize=9,
figsize=(14,8),
color=colors,
edgecolor='black',
linewidth=1,
legend= None,
ylabel = 'Yield per colony (lbs)',
title='Honey Yield per Colony (2012)')
plt.gca().set_facecolor('#ECECEC')
# Set the y-axis limit (so data labels not cut off)
plt.ylim(top=max(df_2012_state_yield['Yield_per_Colony'])+max(df_2012_state_yield['Yield_per_Colony']/12))
# Add data labels on top of the bars
for i, v in enumerate(df_2012_state_yield['Yield_per_Colony']):
plt.text(i-0.3, v+max(df_2012_state_yield['Yield_per_Colony']/50), str(v), rotation=90)