import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import linregress
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())

df = pd.read_csv(r'honey_production.csv')


df = df.rename(columns={
'state': 'State',
'numcol': 'Number_of_Colonies',
'yieldpercol': 'Yield_per_Colony',
'totalprod': 'Total_Production',
'stocks': 'Stocks_Held',
'priceperlb': 'Price_per_Pound',
'prodvalue': 'Prod_Value_USD',
'year': 'Year'
})

df[['Number_of_Colonies', 'Total_Production', 'Stocks_Held', 'Prod_Value_USD']] = df[['Number_of_Colonies', 'Total_Production', 'Stocks_Held', 'Prod_Value_USD']].astype(int)

df.head()


df_year = df[['Year','Stocks_Held','Prod_Value_USD','Number_of_Colonies','Total_Production']].groupby('Year').sum().reset_index()

# Create lagged 'Total_Production_Diff' column showing year-over-year difference
df_year['Total_Production_lag'] = df_year['Total_Production'].shift(1)
df_year['Total_Production_Diff'] = df_year['Total_Production'] - df_year['Total_Production_lag'] 
df_year = df_year.drop(['Total_Production_lag'], axis=1)
df_year['Total_Production_Diff'] = df_year['Total_Production_Diff'].fillna(0).astype(int)

# Create lagged 'Number_of_Colonies_Diff' showing year-over-year difference
df_year['Number_of_Colonies_lag'] = df_year['Number_of_Colonies'].shift(1)
df_year['Number_of_Colonies_Diff'] = df_year['Number_of_Colonies'] - df_year['Number_of_Colonies_lag'] 
df_year = df_year.drop(['Number_of_Colonies_lag'], axis=1)
df_year['Number_of_Colonies_Diff'] = df_year['Number_of_Colonies_Diff'].fillna(0).astype(int)

df_year_styled = df_year.style.highlight_max(subset=['Total_Production', 'Total_Production_Diff', 'Number_of_Colonies_Diff', 'Number_of_Colonies','Prod_Value_USD', 'Stocks_Held'], color='#85e62c').highlight_min(subset=['Total_Production', 'Total_Production_Diff', 'Number_of_Colonies_Diff', 'Number_of_Colonies','Prod_Value_USD', 'Stocks_Held'], color='#ff6242')

df_year_styled


df_for_yoy = df_year[df_year['Year']>1998]

for i, bar in enumerate(plt.bar(df_for_yoy['Year'], df_for_yoy['Total_Production_Diff']/1000000, color = ['red' if yi < 0 else 'green' for yi in df_for_yoy['Total_Production_Diff']/1000000])):
    height = bar.get_height()
    if height >= 0: # if column is positive, add data label on top, else add on bottom
        plt.text(bar.get_x() + bar.get_width()/2, height+1, str(int(height)), ha='center', va='bottom')
    else:
        plt.text(bar.get_x() + bar.get_width()/2, height-1, str(int(height)), ha='center', va='top')
plt.xlabel('Year')
plt.ylabel("Production difference (millions lbs)")
plt.title("Difference in Honey Production Year-over-Year")
plt.gca().set_facecolor('#ECECEC')

# Set the y-axis limit (so data labels not cut off)
plt.ylim(top=max(df_for_yoy['Total_Production_Diff']/1000000)+max(df_for_yoy['Total_Production_Diff']/1000000/6))
plt.ylim(bottom=min(df_for_yoy['Total_Production_Diff']/1000000)+min(df_for_yoy['Total_Production_Diff']/1000000/6))

(-39.445, 35.263666666666666)


for i, bar in enumerate(plt.bar(df_for_yoy['Year'], df_for_yoy['Number_of_Colonies_Diff']/1000, color = ['red' if yi < 0 else 'green' for yi in df_for_yoy['Number_of_Colonies_Diff']/1000])):
    height = bar.get_height()
    if height >= 0: # if column is positive, add data label on top, else add on bottom
        plt.text(bar.get_x() + bar.get_width()/2, height+1, str(int(height)), ha='center', va='bottom')
    else:
        plt.text(bar.get_x() + bar.get_width()/2, height-1, str(int(height)), ha='center', va='top')
plt.xlabel('Year')
plt.ylabel("Difference in number of colonies (thousands)")
plt.title("Difference in Number of Colonies Year-over-Year")
plt.gca().set_facecolor('#ECECEC')

# Set the y-axis limit (so data labels not cut off)
plt.ylim(top=max(df_for_yoy['Number_of_Colonies_Diff']/1000)+max(df_for_yoy['Number_of_Colonies_Diff']/1000/6))
plt.ylim(bottom=min(df_for_yoy['Number_of_Colonies_Diff']/1000)+min(df_for_yoy['Number_of_Colonies_Diff']/1000/6))

(-233.33333333333334, 226.33333333333334)


plt.figure(figsize=(14,8))
plt.plot(df_year['Year'],df_year['Prod_Value_USD']/1000000, linewidth=3, marker='o', markeredgewidth=11, color='red')
plt.title('Value of Honey Production Over Time', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylim(0, None)
plt.ylabel('Production value (millions USD)', fontsize=14)
plt.tick_params(axis='both', which='major', labelsize=14)
plt.gca().set_facecolor('#ECECEC')

# Create and plot trend line
slope, intercept, r_value, p_value, std_err = linregress(df_year['Year'], df_year['Prod_Value_USD']/1000000)
plt.plot(df_year['Year'], intercept + slope*df_year['Year'], 'r', label='fitted line', linestyle='--')

[<matplotlib.lines.Line2D at 0x1c5c4328820>]


plt.figure(figsize=(14,8))
plt.plot(df_year['Year'],df_year['Total_Production']/1000000, linewidth=3, marker='o', markeredgewidth=11, color='red')
plt.title('Total Honey Production Over Time', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylim(0, None)
plt.ylabel('Honey production (millions lbs)', fontsize = 14)
plt.tick_params(axis='both', which='major', labelsize=14)
plt.gca().set_facecolor('#ECECEC')

# Create and plot trend line
slope, intercept, r_value, p_value, std_err = linregress(df_year['Year'], df_year['Total_Production']/1000000)
plt.plot(df_year['Year'], intercept + slope*df_year['Year'], 'r', label='fitted line', linestyle='--')

# Set the y-axis limit
plt.ylim(top=max(df_year['Total_Production']/1000000)*1.1)

(0.0, 241.5138)


df_2012_state_yield = pysqldf("""
SELECT
State, Yield_per_Colony
FROM df
WHERE Year = 2012
GROUP BY State
ORDER BY Yield_per_Colony DESC
""")

df_2012_state_yield.head(10)


# Normalize the data to be between 0 and 1 for colouring
norm = plt.Normalize(min(df_2012_state_yield['Yield_per_Colony']), max(df_2012_state_yield['Yield_per_Colony']))
colors = plt.cm.Reds(norm(df_2012_state_yield['Yield_per_Colony']))

# Plot the bar chart
df_2012_state_yield.plot.bar(x='State',
                                 y='Yield_per_Colony',
                                 rot=90,
                                 fontsize=9,
                                 figsize=(14,8),
                                 color=colors,
                                 edgecolor='black',
                                 linewidth=1,
                                 legend= None,
                                 ylabel = 'Yield per colony (lbs)',
                                 title='Honey Yield per Colony (2012)')

plt.gca().set_facecolor('#ECECEC')

# Set the y-axis limit (so data labels not cut off)
plt.ylim(top=max(df_2012_state_yield['Yield_per_Colony'])+max(df_2012_state_yield['Yield_per_Colony']/12))

# Add data labels on top of the bars
for i, v in enumerate(df_2012_state_yield['Yield_per_Colony']):
    plt.text(i-0.3, v+max(df_2012_state_yield['Yield_per_Colony']/50), str(v), rotation=90)

	State	Number_of_Colonies	Yield_per_Colony	Total_Production	Stocks_Held	Price_per_Pound	Prod_Value_USD	Year
0	AL	16000	71	1136000	159000	0.72	818000	1998
1	AZ	55000	60	3300000	1485000	0.64	2112000	1998
2	AR	53000	65	3445000	1688000	0.59	2033000	1998
3	CA	450000	83	37350000	12326000	0.62	23157000	1998
4	CO	27000	72	1944000	1594000	0.70	1361000	1998

	Year	Stocks_Held	Prod_Value_USD	Number_of_Colonies	Total_Production	Total_Production_Diff	Number_of_Colonies_Diff
0	1998	80565000	146091000	2621000	219519000	0	0
1	1999	78390000	123657000	2637000	202387000	-17132000	16000
2	2000	84956000	131568000	2604000	219558000	17171000	-33000
3	2001	64764000	132282000	2542000	185748000	-33810000	-62000
4	2002	39177000	227302000	2565000	171265000	-14483000	23000
5	2003	40619000	252079000	2591000	181372000	10107000	26000
6	2004	60981000	197307000	2539000	182729000	1357000	-52000
7	2005	62187000	160793000	2394000	173969000	-8760000	-145000
8	2006	60235000	157924000	2378000	154238000	-19731000	-16000
9	2007	52405000	161356000	2428000	147621000	-6617000	50000
10	2008	50996000	229992000	2326000	162972000	15351000	-102000
11	2009	37322000	213920000	2472000	145068000	-17904000	146000
12	2010	44797000	278370000	2666000	175294000	30226000	194000
13	2011	36550000	258688000	2466000	147201000	-28093000	-200000
14	2012	31662000	280725000	2509000	140907000	-6294000	43000

	State	Yield_per_Colony
0	MS	118
1	LA	86
2	HI	75
3	WI	69
4	ND	69
5	MN	67
6	NE	65
7	FL	64
8	SD	63
9	AR	63

Import modules and data¶

Dataframe preparation¶

Group data and create year-over-year difference columns¶

Create year-over-year honey production plot¶

Create year-over-year colony number plot¶

Create plot of honey production value over time¶

Create plot of honey production amount over time¶

Create plot of honey production value by state (using SQL)¶