Import the relevant packages

In [1]:
import pandas as pd
import plotnine as p9
from plotnine import *

Question 1: Read in the data frame

In [2]:
df = pd.DataFrame()
for i in range(1980, 2015):
    yearpart = pd.read_csv("~/Downloads/Names/yob" + str(i) + ".txt", header=None)
    yearpart['year'] = i
    df = df.append(yearpart)
In [3]:
df.columns = ['Name', 'Gender', 'Count', 'Year']
In [4]:
df.shape
Out[4]:
(961695, 4)

Question 2: Plot the popularity of your own name over the years

In [5]:
my_name = df[df.Name=="Ishmael"]
In [9]:
ggplot(my_name, aes("Year", "Count")) + geom_line() + ylim(0,200)
Out[9]:
<ggplot: (-9223363243896804643)>

Question 3: Visualize the growth in girl names from 1880-2014

In [49]:
girl_names = df[df.Gender=="F"]
In [68]:
year_count = pd.DataFrame(girl_names.groupby('Year').Name.count()).reset_index()
In [72]:
ggplot(year_count, aes("Year", "Name")) + geom_line()
Out[72]:
<ggplot: (8794167791680)>

Question 4: Write a function to generate the most popular names for a given gender and year

In [86]:
def toptennames(gender, year):
    ttn = df[(df.Year==year) & (df.Gender==gender)]
    ttn = ttn.sort_values("Count", ascending=False)
    return(ttn.head(10))
In [88]:
toptennames("F", 1990)
Out[88]:
Name Gender Count Year
0 Jessica F 46466 1990
1 Ashley F 45549 1990
2 Brittany F 36535 1990
3 Amanda F 34406 1990
4 Samantha F 25864 1990
5 Sarah F 25808 1990
6 Stephanie F 24856 1990
7 Jennifer F 22221 1990
8 Elizabeth F 20742 1990
9 Lauren F 20498 1990

Question 5: Compute the names for whom the number of boys+girls with the name is the largest for 2014

In [12]:
names_2014 = df[df.Year == 2014]
names_2014.drop(['Year'], axis=1, inplace=True)
In [13]:
names_2014_male = names_2014[names_2014.Gender=="M"]
names_2014_female = names_2014[names_2014.Gender=="F"]
In [14]:
names_2014 = names_2014_male.merge(names_2014_female, 
                                   on='Name', 
                                   how='outer',
                                   suffixes=['_male', '_female'])
names_2014 = names_2014.fillna(0)
In [15]:
names_2014['total'] = names_2014.Count_male - names_2014.Count_female
names_2014['abstotal'] = names_2014.total.abs()
names_2014 = names_2014.sort_values('abstotal', ascending=True)
names_2014 = names_2014[(names_2014.Count_male + names_2014.Count_female) > 1000]
In [16]:
names_2014.head(10)
Out[16]:
Name Gender_male Count_male Gender_female Count_female total abstotal
351 Skyler M 911.0 F 1070.0 -159.0 159.0
530 Justice M 518.0 F 756.0 -238.0 238.0
224 Charlie M 1670.0 F 1432.0 238.0 238.0
359 Dakota M 876.0 F 1136.0 -260.0 260.0
354 Phoenix M 901.0 F 629.0 272.0 272.0
410 Milan M 748.0 F 424.0 324.0 324.0
591 Tatum M 462.0 F 828.0 -366.0 366.0
339 Amari M 970.0 F 585.0 385.0 385.0
415 Rory M 741.0 F 326.0 415.0 415.0
653 Sage M 399.0 F 834.0 -435.0 435.0