Python Fundamentals: Data Types, Operations, and Libraries
Intro to Python
Operations with Other Types
monthly_savings = 10
num_months = 12
intro = "Hello! How are you?"
# Calculate year_savings using monthly_savings and num_months
year_savings = monthly_savings * num_months
# Print the type of year_savings
print(type(year_savings))
# Assign sum of intro and intro to doubleintro
doubleintro = intro + intro
# Print out doubleintro
print(doubleintro)
Type Conversion
savings = 100
total_savings = 150
# Fix the printout
print("I started with $" + str(savings) + " and now have $" + str(total_savings) + ". Awesome!")
# Definition of pi_string
pi_string = "3.1415926"
# Convert pi_string into float: pi_float
pi_float = float(pi_string)
print("My name is " + " " + "sid" * 3)
# Create areas_copy (use list() or [:] to copy, so changes to the copy don't affect the original)
areas_copy = list(areas)
Deletion from a List
del(areas[-4:-2])
Here you give the index (or a slice, as above) of the element(s) to remove.
List Functions
print(len(var1))
max(list2)
full_sorted = sorted(full, key=None, reverse=True)
sorted() returns a new list and leaves full unchanged.
key is a function (e.g. len) that is applied to each element to decide the sort order.
pow()
takes three arguments: base, exp, and mod. base and exp are required; mod is optional. With all three, pow(base, exp, mod) returns (base ** exp) % mod - the remainder after dividing the power by the third argument.
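A quick runnable check of both notes, with a made-up word list:

words = ["kitchen", "hallway", "bedroom", "spam"]
print(sorted(words, key=len))  # ['spam', 'kitchen', 'hallway', 'bedroom'] - shortest first
print(pow(2, 5))     # 32
print(pow(2, 5, 3))  # 2, i.e. (2 ** 5) % 3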
List Methods
list2.append(4)
place_up = place.upper()
print(place.count("o"))
print(areas.index(20.0))
# Reverse the order of the elements in areas
areas.reverse()
areas.remove(11.25)
Here you give the value of the element to remove (the first match is deleted).
1D Arrays Arithmetic
arrayadd = np.array([1, 2, 3]) + np.array([4, 5, 6])
arrayadd
array([5, 7, 9])
A 2D array is built from a list of lists.
print(np_baseball.shape)
Array slicing rows, columns
np_baseball[0:2, 0]  # first two rows, first column
For element-wise operations on two 2D arrays, the shapes must match (e.g. the number of columns must be the same). Scalar multiplication always works.
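A minimal sketch of these rules, with a made-up 3x2 array:

import numpy as np

np_mat = np.array([[1, 2],
                   [3, 4],
                   [5, 6]])
print(np_mat * 2)                   # scalar multiplication scales every element
print(np_mat + np.array([10, 10]))  # a 2-element row broadcasts across all 3 rows
print(np_mat + np_mat)              # element-wise addition of same-shaped arrays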
print(np.mean(np_height_in))
# Print out the median of np_height_in
print(np.median(np_height_in))
Array Statistics
# Print mean height (first column)
avg = np.mean(np_baseball[:, 0])
print("Average: " + str(avg))
# Print median height
med = np.median(np_baseball[:, 0])
print("Median: " + str(med))
# Print the standard deviation of height
stddev = np.std(np_baseball[:, 0])
print("Standard Deviation: " + str(stddev))
# Print the correlation between the first and second column
corr = np.corrcoef(np_baseball[:, 0], np_baseball[:, 1])
print("Correlation: " + str(corr))
# Heights of the goalkeepers: gk_heights
gk_heights = np_heights[np_positions == 'GK']
# Heights of the other players: other_heights
other_heights = np_heights[np_positions != 'GK']
# Print the median height of goalkeepers
print("Median height of goalkeepers: " + str(np.median(gk_heights)))
# Print the median height of other players
print("Median height of other players: " + str(np.median(other_heights)))
Intermediate Python
import matplotlib.pyplot as plt
# Make a line plot: year on the x-axis, pop on the y-axis
plt.plot(year, pop)
(line plot)
plt.scatter(gdp_cap, life_exp)
(scatter plot; plt.xscale('log') puts the x-axis on a log scale)
plt.hist(life_exp)
plt.xlabel(xlab)
plt.xticks(tick_val, tick_lab)
plt.ylabel(ylab)
plt.title(title)
plt.hist(life_exp, bins=5)
(histogram with 5 bins)
plt.show()
plt.clf()  # clear the current figure so the next plot starts fresh
# Double np_pop
np_pop = np_pop * 2
# Update: set s argument to np_pop
plt.scatter(gdp_cap, life_exp, s=np_pop, c=col, alpha=0.8)
# size, color, opacity
plt.text(5700, 80, 'China')
# Add grid() call
plt.grid(True)
Dictionaries
europe = {'spain': 'madrid', 'france': 'paris', 'germany': 'berlin', 'norway': 'oslo'}
# Print out the keys in europe
print(europe.keys())
# Print out value that belongs to key 'norway'
print(europe['norway'])
# Add and update work the same
europe['germany'] = 'berlin'
# Remove australia
del(europe['australia'])
# Create sub-dictionary data
data = {'capital': 'rome', 'population': 59.83}
# Add data to europe under key 'italy'
europe['italy'] = data
my_list = [1, 2, 3]
my_list[0] = 99 # Modifying an element
my_list.append(4) # Adding an element
del my_list[1] # Removing an element
import numpy as np
my_array = np.array([1, 2, 3])
my_array[0] = 99 # Modifying an element
my_array = np.append(my_array, 4) # Adding an element
my_array = np.delete(my_array, 1)  # Removing an element (by index)
To add a row to a 2D array, first reshape the new row with reshape(1, -1); the -1 tells NumPy to infer the number of columns on its own.
array4 = np.array([[1, 2, 3], [7, 8, 9]])
# List to append
new_list = np.array([4, 5, 6])
# Reshape the new list to be a 2D array (row vector)
new_list = new_list.reshape(1, -1)
# Append the new list to the existing array along axis=0
result_array = np.append(array4, new_list, axis=0)
print(result_array)
pd.concat
is used to combine dataframes; to add a row, build a one-row dataframe and concat it (DataFrame.append() is deprecated in modern pandas - see the sketch below).
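A minimal sketch of adding a row with pd.concat() (the frames here are made up for illustration):

import pandas as pd

df = pd.DataFrame({"country": ["US", "AUS"], "cars_per_cap": [809, 731]})
new_row = pd.DataFrame({"country": ["JPN"], "cars_per_cap": [588]})
# DataFrame.append() is removed in pandas 2.x; concat a one-row frame instead
df = pd.concat([df, new_row], ignore_index=True)
print(df)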
row_labels = ['US', 'AUS', 'JPN', 'IN', 'RU', 'MOR', 'EG']
# Specify row labels of cars
cars.index = row_labels
# Fix import by including index_col
cars = pd.read_csv('cars.csv', index_col=0)
pd.concat([df1, df2])  # takes a list of the dataframes to concatenate
df = df.append(new_row, ignore_index=True)  # older pandas only; the keyword is ignore_index, not index_ignore
A dataframe prints row observations when sliced, e.g. print(cars[0:3]).
Also like this:
df1.loc["BR":"CH", ["country"]]
df1.iloc[0:2, 0:1]
Use single brackets for a range (slice) of rows or columns; double brackets select a list of specific labels (see the sketch below).
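A short sketch of the bracket rules (df1 is assumed to be the country dataframe used above, with row labels like "BR" and "CH"):

print(df1.loc["BR"])                  # single label -> Series
print(df1.loc[["BR", "CH"]])          # list of labels (double brackets) -> DataFrame
print(df1.loc["BR":"CH", "country"])  # a range (slice) uses single brackets
print(df1.iloc[0:2, 0:1])             # positional ranges also use single brackets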
# my_kitchen bigger than 10 and smaller than 18?
print(my_kitchen > 10 and my_kitchen < 18)
# my_kitchen smaller than 14 or bigger than 17?
print(my_kitchen < 14 or my_kitchen > 17)
my_house = np.array([18.0, 20.0, 10.75, 9.50])
your_house = np.array([14.0, 24.0, 14.25, 9.0])
# my_house greater than 18.5 or smaller than 10
print(np.logical_or(my_house > 18.5, my_house < 10))
# Both my_house and your_house smaller than 11
print(np.logical_and(my_house < 11, your_house < 11))
if room == "kit":
    print("looking around in the kitchen.")
elif room == "bed":
    print("looking around in the bedroom.")
else:
    print("looking around elsewhere.")
# Extract drives_right column as Series: dr
dr = cars['drives_right']
print(dr)
# Use dr to subset cars: sel
sel = cars[dr]
# Print sel
print(sel)
import pandas as pd
cars = pd.read_csv('cars.csv', index_col=0)
# Import numpy, you'll need this
import numpy as np
# Create medium: observations with cars_per_cap between 100 and 500
cpc = cars['cars_per_cap']
between = np.logical_and(cpc > 100, cpc < 500)
medium = cars[between]
While Loop
offset = -6
# Code the while loop
while offset != 0:
    print("correcting...")
    if offset > 0:
        offset = offset - 1
    else:
        offset = offset + 1
    print(offset)
areas = [11.25, 18.0, 20.0, 10.75, 9.50]
# Change for loop to use enumerate() and update print()
for x, y in enumerate(areas):
    print("room " + str(x) + ": " + str(y))
house = [["hallway", 11.25],
["kitchen", 18.0],
["living room", 20.0],
["bedroom", 10.75],
["bathroom", 9.50]]
# Build a for loop from scratch
for hou in house:
    x = hou[0]
    y = hou[1]
    print("the " + str(x) + " is " + str(y) + " sqm")
# Iterate over a dictionary
for key, value in europe.items():
    print("the capital of " + str(key) + " is " + str(value))
# If the values are themselves dictionaries, nest a second loop:
for country, country_info in europe.items():
    print(country)
    for key, value in country_info.items():
        print(f"  {key}: {value}")
    print()
Iterate over a 2D array
for baseball in np.nditer(np_baseball):
    print(baseball)
Iterate over a dataframe
cars = pd.read_csv('cars.csv', index_col=0)
# Iterate over rows of cars
for lab, row in cars.iterrows():
    print(lab)
    print(row)
To get values of a specific column:
for lab, row in cars.iterrows():
    print(lab + ": " + str(row['cars_per_cap']))
Add a new column:
for lab, row in brics.iterrows():
    brics.loc[lab, "name_length"] = len(row["country"])
for lab, row in cars.iterrows():
    cars.loc[lab, "COUNTRY"] = row["country"].upper()
Another way to add a column (apply() works on the whole column at once, so no loop is needed):
cars["COUNTRY"] = cars["country"].apply(str.upper)
print(cars)
brics["name_length"] = brics["country"].apply(len)
Random Function
np.random.seed(123)
np.random.rand()
(returns a float in the interval [0, 1))
print(np.random.randint(1, 7))  # an integer from 1 to 6; the upper bound is exclusive
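Seeding is what makes the "random" sequence reproducible - a small sketch:

import numpy as np

np.random.seed(123)
first = np.random.rand()
np.random.seed(123)   # same seed again...
second = np.random.rand()
print(first == second)  # True: the same draws come out in the same order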
Step Calculation
import numpy as np
np.random.seed(123)
# Initialize random_walk
random_walk = [0]
for x in range(100):
    # Set step: last element in random_walk
    step = random_walk[-1]
    # Roll the dice
    dice = np.random.randint(1, 7)
    # Determine next step
    if dice <= 2:
        # Use max to make sure step can't go below 0
        step = max(0, step - 1)
    elif dice <= 5:
        step = step + 1
    else:
        step = step + np.random.randint(1, 7)
    # Append next step to random_walk
    random_walk.append(step)
# Print random_walk
print(random_walk)
All Walks
all_walks = []
# Simulate random walk five times
for i in range(5):
    # Code from before
    random_walk = [0]
    for x in range(100):
        step = random_walk[-1]
        dice = np.random.randint(1, 7)
        if dice <= 2:
            step = max(0, step - 1)
        elif dice <= 5:
            step = step + 1
        else:
            step = step + np.random.randint(1, 7)
        # Clumsiness: a 0.1% chance of falling back to step 0
        if np.random.rand() <= 0.001:
            step = 0
        random_walk.append(step)
    # Append random_walk to all_walks
    all_walks.append(random_walk)
# Print all_walks
print(all_walks)
Array transposed: changing rows to columns and vice-versa
np_aw_t = np.transpose(np_aw)
# Plot np_aw_t and show
plt.plot(np_aw_t)
Histogram for Walks
np_aw_t = np.transpose(np.array(all_walks))
# Select last row from np_aw_t: ends
ends = np_aw_t[-1, :]
# Plot histogram of ends, display plot
plt.hist(ends)
plt.show()
Tuples (are immutable, can’t be modified)
# define tuple
nums = (2, 3, 4)
a, b, c = nums # unpack
print(nums)
print(a)
Defining a function with tuples
def shout(a, b):
    shout1 = a + "!!!"
    shout2 = b + "!!!"
    shout_words = (shout1, shout2)  # pack
    return shout_words
yell1, yell2 = shout("congratulations", "you") # unpack
print(yell1)
print(yell2)
Counting tweets by language
def tweet_count(name, col_name):
    dic = {}
    extract = name[col_name]
    for entry in extract:
        if entry in dic.keys():
            dic[entry] += 1
        else:
            dic[entry] = 1
    return dic
result = tweet_count(tweets_df, 'lang')
print(result)
Nested functions (the outer function returns the results of calling the inner one)
def test(word1, word2, word3):
    def inner(word):
        return word + "!"
    return inner(word1), inner(word2), inner(word3)

assignment = test("a", "b", "c")
print(assignment)  # ('a!', 'b!', 'c!') - the three results come back packed in a tuple
Change the value of a non-local variable
# Define echo_shout()
# Define echo_shout()
def echo_shout(word):
    """Change the value of a nonlocal variable"""
    # Concatenate word with itself: echo_word
    echo_word = word + word
    # Print echo_word
    print(echo_word)

    # Define inner function shout()
    def shout():
        """Alter a variable in the enclosing scope"""
        # Declare echo_word nonlocal so assigning to it changes the enclosing variable
        nonlocal echo_word
        # Change echo_word to echo_word concatenated with '!!!'
        echo_word = echo_word + "!!!"

    # Call function shout()
    shout()
    # Print echo_word
    print(echo_word)

# Call function echo_shout() with argument 'hello'
echo_shout("hello")
*args
for accepting any number of extra positional arguments; inside the function they arrive packed in a tuple, e.g. ("A", "B")
**kwargs
for accepting key-value arguments (name="c", age=23); inside the function they arrive packed in a dict (see the sketch below)
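A minimal sketch of both (the function name gather is made up):

def gather(*args, **kwargs):
    print(args)    # extra positional arguments arrive packed in a tuple
    print(kwargs)  # extra keyword arguments arrive packed in a dict

gather("A", "B", name="c", age=23)
# ('A', 'B')
# {'name': 'c', 'age': 23}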
def count_entries(df, *args):
    """Return a dictionary with counts of
    occurrences as value for each key."""
    # Initialize an empty dictionary: cols_count
    cols_count = {}
    # Iterate over column names in args
    for col_name in args:
        # Extract column from DataFrame: col
        col = df[col_name]
        # Iterate over the column in DataFrame
        for entry in col:
            # If entry is in cols_count, add 1
            if entry in cols_count.keys():
                cols_count[entry] += 1
            # Else add the entry to cols_count, set the value to 1
            else:
                cols_count[entry] = 1
    # Return the cols_count dictionary
    return cols_count
# Call count_entries(): result1
result1 = count_entries(tweets_df, 'lang')
# Call count_entries(): result2
result2 = count_entries(tweets_df, 'lang', 'source')
# Print result1 and result2
print(result1)
print(result2)
Lambda function
echo_word = lambda word1, echo: (word1 * echo)
# Call echo_word: result
result = echo_word('hey', 5)
# Print result
print(result)
Map
shout_spells = map(lambda item: item + "!!!", spells)
Filter
result = filter(lambda member: len(member) > 6, fellowship)
Reduce
from functools import reduce
result = reduce(lambda item1, item2: item1 + item2, stark)
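map() and filter() return lazy iterators, so wrap them in list() to see the results. A runnable sketch with sample lists standing in for the course's spells/fellowship/stark data:

from functools import reduce

spells = ["protego", "accio", "lumos"]
fellowship = ["frodo", "samwise", "aragorn", "gimli"]
stark = ["r", "o", "b", "b"]

print(list(map(lambda item: item + "!!!", spells)))              # ['protego!!!', 'accio!!!', 'lumos!!!']
print(list(filter(lambda member: len(member) > 6, fellowship)))  # ['samwise', 'aragorn']
print(reduce(lambda item1, item2: item1 + item2, stark))         # 'robb'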
def shout_echo(word1, echo=1):
    """Concatenate echo copies of word1 and three
    exclamation marks at the end of the string."""
    # Raise an error with raise
    if echo < 0:
        raise ValueError('echo must be greater than or equal to 0')
    # Concatenate echo copies of word1 using *: echo_word
    echo_word = word1 * echo
    # Concatenate '!!!' to echo_word: shout_word
    shout_word = echo_word + '!!!'
    # Return shout_word
    return shout_word
# Call shout_echo
shout_echo("particle", echo=5)
Another raise example - validate an argument before using it:
# Define count_entries()
def count_entries(df, col_name='lang'):
    """Return a dictionary with counts of
    occurrences as value for each key."""
    # Raise a ValueError if col_name is NOT in DataFrame
    if col_name not in df.columns:
        raise ValueError('The DataFrame does not have a ' + col_name + ' column.')
tesla_data.reset_index()
(resets the row labels to the default integers 0, 1, 2, ... and moves the old index into a column; assign the result back, since it returns a new dataframe - see the sketch below)
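A minimal sketch of the effect, using a made-up two-row frame with a date index:

import pandas as pd

df = pd.DataFrame({"close": [101.2, 102.5]},
                  index=pd.to_datetime(["2021-01-04", "2021-01-05"]))
df = df.reset_index()  # old index becomes a column; rows relabeled 0, 1
print(df)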
Visualizing Time Series
# Print the data type of each column in discoveries
print(discoveries.dtypes)
# Convert the date column to a datestamp type
discoveries['date'] = pd.to_datetime(discoveries['date'])
# Print the data type of each column in discoveries, again
print(discoveries.dtypes)
discoveries = discoveries.set_index('date')
# Plot the time series in your DataFrame
ax = discoveries.plot(color='blue', figsize=(8, 3), linewidth=4, fontsize=6)
# Specify the x-axis label in your plot
ax.set_xlabel('Date')
# Specify the y-axis label in your plot
ax.set_ylabel('Number of great discoveries')
# Show plot
plt.show()
plt.style.use('fivethirtyeight')  # pick any name from the available styles
print(plt.style.available)
Plot a slice of a time series, with reference lines and shaded spans:
subset = df['1960':'1970']                # slice the dataframe by its date index
ax = subset.plot(color='blue')
ax.axvline(pd.Timestamp('1964-01-01'), linestyle='--')  # vertical line at a date
ax.axhline(8, linestyle='--')                           # horizontal line at a y-value
ax.axvspan(pd.Timestamp('1964-01-01'), pd.Timestamp('1968-01-01'), alpha=0.3)  # vertical span between two dates
ax.axhspan(6, 8, alpha=0.3)                             # horizontal span between two y-values
df.notnull() flags present values; df.isnull().sum() counts missing values per column.
df.fillna(method='bfill')  # or method='ffill' - fill gaps backward or forward (see the sketch below)
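A small sketch of the two fill directions, using a made-up series with one gap:

import pandas as pd
import numpy as np

s = pd.Series([1.0, np.nan, 3.0])
print(s.isnull().sum())          # 1 missing value
print(s.fillna(method="ffill"))  # forward fill:  1.0, 1.0, 3.0
print(s.fillna(method="bfill"))  # backward fill: 1.0, 3.0, 3.0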
rolling mean
# Compute the 52 weeks rolling mean of the co2_levels DataFrame
ma = co2_levels.rolling(window=52).mean()
# Compute the 52 weeks rolling standard deviation of the co2_levels DataFrame
mstd = co2_levels.rolling(window=52).std()
# Add the upper bound column to the ma DataFrame
ma['upper'] = ma['co2'] + (2 * mstd['co2'])
# Add the lower bound column to the ma DataFrame
ma['lower'] = ma['co2'] - (2 * mstd['co2'])
# Plot the content of the ma DataFrame
ax = ma.plot(linewidth=0.8, fontsize=6)
# Specify labels, legend, and show the plot
ax.set_xlabel('Date', fontsize=10)
ax.set_ylabel('CO2 levels in Maui, Hawaii', fontsize=10)
ax.set_title('Rolling mean and variance of CO2 levels\nin Maui, Hawaii from 1958 to 2001', fontsize=10)
plt.show()
print(ma)
index_month = co2_levels.index.month  # requires a DatetimeIndex (see the sketch below)
mean_co2_levels_by_month = co2_levels.groupby(index_month).mean()
df.plot(kind='density')  # or kind='hist' (set bins for a histogram)
df.plot.area(figsize=(8, 3), fontsize=6, linewidth=1, colormap='Dark2')  # colormap gives each column of the area plot a different color
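A hedged sketch of the month-grouping trick, with made-up CO2 readings on a DatetimeIndex:

import pandas as pd

co2_levels = pd.DataFrame(
    {"co2": [315.0, 316.2, 317.1, 316.5, 315.4, 316.8]},
    index=pd.to_datetime(["1958-01-05", "1958-02-02", "1958-03-02",
                          "1959-01-04", "1959-02-01", "1959-03-01"]))
index_month = co2_levels.index.month           # 1, 2, 3, 1, 2, 3
print(co2_levels.groupby(index_month).mean())  # average CO2 per calendar month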
Add summary statistics to plot
ax = meat.plot(fontsize=6, linewidth=1)
# Add x-axis labels
ax.set_xlabel('Date', fontsize=6)
# Add summary table information to the plot
ax.table(cellText=meat_mean.values,
         colWidths=[0.15] * len(meat_mean.columns),
         rowLabels=meat_mean.index,
         colLabels=meat_mean.columns,
         loc='top')
# Specify the fontsize and location of your legend
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 0.95), ncol=3, fontsize=6)
facet plot
df.plot(subplots=True, linewidth=0.5, layout=(2, 4), figsize=(16, 10), sharex=False, sharey=False)
plt.show()
correlations
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import kendalltau
To find the correlation between columns:
meat[['column1', 'column2']].corr(method='pearson')  # or 'spearman', 'kendall'
import seaborn as sns
corr_meat = meat.corr(method='pearson')
sns.heatmap(corr_meat)  # or sns.clustermap(corr_meat)
sns.heatmap(corr_meat,
            annot=True,
            linewidths=0.4,
            annot_kws={"size": 10})
plt.xticks(rotation=90)
plt.yticks(rotation=0)
# Customize the heatmap of the corr_meat correlation matrix and rotate the x-axis labels
fig = sns.clustermap(corr_meat,
                     row_cluster=True,
                     col_cluster=True,
                     figsize=(10, 10))
plt.setp(fig.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
plt.setp(fig.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
plt.show()
Intro to visualization with matplotlib
Subplotting
fig, ax = plt.subplots()
# Plot MLY-PRCP-NORMAL from seattle_weather against the MONTH
ax.plot(seattle_weather["MONTH"], seattle_weather["MLY-PRCP-NORMAL"])
# Plot MLY-PRCP-NORMAL from austin_weather against MONTH
ax.plot(austin_weather["MONTH"], austin_weather["MLY-PRCP-NORMAL"],
        color='b', linestyle='--', marker='o')  # other marker codes: 'v', '^', 's', ...
# Call the show function
plt.show()
To get a specific grid of rows and columns of subplots (set sharey=True for the same y-axis range on all of them):
fig, ax = plt.subplots(2, 1, sharey=True)
ax[0].plot(dataframe['date'], dataframe['co2'])
climate_change = pd.read_csv('climate_change.csv', parse_dates=["date"], index_col='date')
Joining data with pandas
Inner join
taxi_own_veh = taxi_owners.merge(taxi_veh, on='vid', suffixes=('_own', '_veh'))
print(taxi_own_veh['fuel_type'].value_counts())
counted_df = licenses_owners.groupby("title").agg({'account': 'count'})
# Sort counted_df in descending order
sorted_df = counted_df.sort_values("account", ascending=False)
ridership_cal = ridership.merge(cal, on=["year", "month", "day"])
three table merge
licenses_zip_ward = licenses.merge(zip_demo, on="zip") \
                            .merge(wards, on="ward")
# Print the results by alderman and show median income
print(licenses_zip_ward.groupby("alderman").agg({'income': 'median'}))
group and sort by multiple columns
pop_vac_lic = land_cen_lic.groupby(['ward', 'pop_2010', 'vacant'],
                                   as_index=False).agg({'account': 'count'})
# Sort pop_vac_lic and print the results
sorted_pop_vac_lic = pop_vac_lic.sort_values(['vacant', 'account', 'pop_2010'],
                                             ascending=[False, True, True])
visualization of merged tables
grant_licenses_ward.groupby('ward').agg('sum').plot(kind='bar', y='grant')
plt.show()
left join
movies_financials = movies.merge(financials, on='id', how='left')
# Count the number of rows in the budget column that are missing
number_of_missing_fin = movies_financials['budget'].isna().sum()
right join
action_scifi = action_movies.merge(scifi_movies, on='movie_id', how='right',
                                   suffixes=('_act', '_sci'))
# From action_scifi, select only the rows where the genre_act column is null
scifi_only = action_scifi[action_scifi['genre_act'].isnull()]
# Merge the movies and scifi_only tables with an inner join
movies_and_scifi_only = movies.merge(scifi_only, left_on="id", right_on="movie_id")
outer join
# Merge iron_1_actors to iron_2_actors on id with outer join using suffixes
iron_1_and_2 = iron_1_actors.merge(iron_2_actors,
                                   on="id",
                                   how="outer",
                                   suffixes=("_1", "_2"))
# Create a boolean mask that is True if name_1 or name_2 is null
m = ((iron_1_and_2['name_1'].isna()) |
     (iron_1_and_2['name_2'].isna()))
# Print the first few rows of iron_1_and_2
print(iron_1_and_2[m].head())
self join (choose the left/right key columns)
original_sequels = sequels.merge(sequels, left_on='sequel', right_on='id',
                                 how='left', suffixes=('_org', '_seq'))
To join on the index, set the required column(s) as the row index when reading:
samuel = pd.read_csv('samuel.csv', index_col=['movie_id', 'cast_id'])
samuel_casts = samuel.merge(casts, on=['movie_id', 'cast_id'])
semi join (like an inner join, but keeps only the left table's columns)
tracks_invoices = non_mus_tcks.merge(top_invoices, on='tid')
# Use .isin() to subset non_mus_tcks to rows with tid in tracks_invoices
top_tracks = non_mus_tcks[non_mus_tcks['tid'].isin(tracks_invoices['tid'])]
# Group the top_tracks by gid and count the tid rows
cnt_by_gid = top_tracks.groupby(['gid'], as_index=False).agg({'tid': 'count'})
# Merge the genres table to cnt_by_gid on gid and print
print(cnt_by_gid.merge(genres, on='gid'))
anti join (returns the left table's rows that have no match in the right table)
# Merge employees and top_cust
empl_cust = employees.merge(top_cust, on='srid',
                            how='left', indicator=True)
# Select the srid column where _merge is left_only
srid_list = empl_cust.loc[empl_cust['_merge'] == 'left_only', 'srid']
# Get employees not working with top customers
print(employees[employees['srid'].isin(srid_list)])
concat different dataframes (vertical by default; axis=1 concatenates horizontally)
pd.concat([inv_jan, inv_feb, inv_mar], ignore_index=False, keys=['jan', 'feb', 'mar'])
sort=True sorts the column labels.
join='inner' keeps only the columns present in every dataframe.
keys labels each original table; ignore_index=True renumbers the rows continuously, while ignore_index=False keeps each table's original index (you can't combine keys with ignore_index=True). See the sketch below.
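A small sketch of keys vs. ignore_index, with made-up invoice frames:

import pandas as pd

inv_jan = pd.DataFrame({"total": [10, 20]})
inv_feb = pd.DataFrame({"total": [30]})
print(pd.concat([inv_jan, inv_feb], ignore_index=True))    # rows renumbered 0, 1, 2
print(pd.concat([inv_jan, inv_feb], keys=["jan", "feb"]))  # MultiIndex: (jan, 0) ... (feb, 0)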
group by keys
avg_inv_by_month = inv_jul_thr_sep.groupby(level=0).agg({'total': 'mean'})
verify merge table relationship
table1.merge(table2, on='col', validate='one_to_one')  # or 'one_to_many', 'many_to_one', 'many_to_many'
pd.concat([df1, df2], verify_integrity=True)  # raises an error on duplicate index values (see the sketch below)
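A sketch of both checks failing on purpose (frames made up for illustration):

import pandas as pd

left = pd.DataFrame({"id": [1, 2], "x": ["a", "b"]})
right = pd.DataFrame({"id": [2, 2], "y": ["c", "d"]})  # duplicate key on purpose

try:
    left.merge(right, on="id", validate="one_to_one")
except pd.errors.MergeError as e:
    print("merge failed:", e)

try:
    pd.concat([left.set_index("id"), right.set_index("id")], verify_integrity=True)
except ValueError as e:
    print("concat failed:", e)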
Concatenation+merge
# Concatenate the classic tables vertically
classic_18_19 = pd.concat([classic_18, classic_19], ignore_index=True)
print(classic_18_19)
# Concatenate the pop tables vertically
pop_18_19 = pd.concat([pop_18, pop_19], ignore_index=True)
print(pop_18_19)
# Merge classic_18_19 with pop_18_19
classic_pop = classic_18_19.merge(pop_18_19, on='tid')
# Using .isin(), filter classic_18_19 rows where tid is in classic_pop
popular_classic = classic_18_19[classic_18_19['tid'].isin(classic_pop['tid'])]
# Print popular chart
print(popular_classic)
merge_ordered() (sorts the result on the merge key column(s); the default join is outer)
pd.merge_ordered(appl, mcd, on='date', suffixes=('_aapl', '_mcd'), fill_method='ffill')
date_ctry = pd.merge_ordered(gdp, pop, on=['country', 'date'],
                             fill_method='ffill')
correlation using merge_ordered
gdp_sp500 = pd.merge_ordered(gdp, sp500, left_on='year', right_on='date',
                             how='left', fill_method='ffill')
# Subset the gdp and returns columns
gdp_returns = gdp_sp500[['gdp', 'returns']]
# Print gdp_returns correlation
print(gdp_returns.corr())
merge_asof() merges on the nearest key values rather than exact matches; like merge_ordered(), it behaves as a left join.
# Use merge_asof() to merge jpm and wells
jpm_wells = pd.merge_asof(jpm, wells, on='date_time', suffixes=('', '_wells'), direction='nearest')
# Use merge_asof() to merge jpm_wells and bac
jpm_wells_bac = pd.merge_asof(jpm_wells, bac, on='date_time', suffixes=('_jpm', '_bac'), direction='nearest')
# Compute price diff
price_diffs = jpm_wells_bac.diff()
# Plot the price diff of the close of jpm, wells and bac only
price_diffs.plot(y=['close_jpm', 'close_wells', 'close_bac'])
plt.show()
# Merge gdp and recession on date using merge_asof()
gdp_recession = pd.merge_asof(gdp, recession, on='date')
# Create a color list based on the row values of gdp_recession['econ_status']
is_recession = ['r' if s == 'recession' else 'g' for s in gdp_recession['econ_status']]
# Plot a bar chart of gdp_recession
gdp_recession.plot(kind='bar', y='gdp', x='date', color=is_recession, rot=90)
plt.show()
pivot table and then query
# Merge gdp and pop on date and country with fill
gdp_pop = pd.merge_ordered(gdp, pop, on=['country', 'date'], fill_method='ffill')
# Add a column named gdp_per_capita to gdp_pop that divides the gdp by pop
gdp_pop['gdp_per_capita'] = gdp_pop['gdp'] / gdp_pop['pop']
print(gdp_pop)
# Pivot gdp_per_capita so the index is date and the columns are country
gdp_pivot = gdp_pop.pivot_table('gdp_per_capita', 'date', 'country')
pivot_table(values, index, columns) moves values from rows into columns.
print(gdp_pivot)
# Select dates equal to or greater than 1991-01-01
recent_gdp_pop = gdp_pivot.query('date >= "1991-01-01"')
# Plot recent_gdp_pop
recent_gdp_pop.plot(rot=90)
plt.show()
query usage
stocks_long.query('stock == "disney" or (stock == "nike" and close < 90)')
melt usage: reshape a dataframe into long format (moves values from columns into rows)
social_fin_tall = social_fin.melt(id_vars=['financial', 'company'],
                                  value_vars=['2018', '2017'],
                                  var_name='year', value_name='dollars')
print(social_fin_tall.head(8))
# Use melt on ten_yr, unpivot everything besides the metric column
bond_perc = ten_yr.melt(id_vars='metric', var_name='date', value_name='close')
# Use query on bond_perc to select only the rows where metric=close
bond_perc_close = bond_perc.query('metric == "close"')
# Merge (ordered) dji and bond_perc_close on date with an inner join
dow_bond = pd.merge_ordered(dji, bond_perc_close, on='date', how='inner',
                            suffixes=('_dow', '_bond'))
# Plot only the close_dow and close_bond columns
dow_bond.plot(y=['close_dow', 'close_bond'], x='date', rot=90)
plt.show()
ur_tall = ur_wide.melt(id_vars=['year'], var_name='month', value_name='unempl_rate')
# Create a date column using the month and year columns of ur_tall
ur_tall['date'] = pd.to_datetime(ur_tall['year'] + '-' + ur_tall['month'])
# Sort ur_tall by date in ascending order
ur_sorted = ur_tall.sort_values("date")
# Plot the unempl_rate by date
ur_sorted.plot(kind='bar', y='unempl_rate', x='date')
plt.show()