https://www.kaggle.com/rounakbanik/ted-talks
import pandas as pd pd.__version__
import matplotlib.pyplot as plt %matplotlib inline
ted = pd.read_csv('ted.csv')
# each row represents a single talk ted.head()
# rows, columns ted.shape
(2550, 17)
# object columns are usually strings, but can also be arbitrary Python objects (lists, dictionaries) ted.dtypes
comments int64 description object duration int64 event object film_date int64 languages int64 main_speaker object name object num_speaker int64 published_date int64 ratings object related_talks object speaker_occupation object tags object title object url object views int64 dtype: object
# count the number of missing values in each column ted.isna().sum()
comments 0 description 0 duration 0 event 0 film_date 0 languages 0 main_speaker 0 name 0 num_speaker 0 published_date 0 ratings 0 related_talks 0 speaker_occupation 6 tags 0 title 0 url 0 views 0 dtype: int64
# sort by the number of first-level comments, though this is biased in favor of older talks ted.sort_values('comments').tail()
# correct for this bias by calculating the number of comments per view ted['comments_per_view'] = ted.comments / ted.views
# interpretation: for every view of the same-sex marriage talk, there are 0.002 comments ted.sort_values('comments_per_view').tail()
# make this more interpretable by inverting the calculation ted['views_per_comment'] = ted.views / ted.comments
# interpretation: 1 out of every 450 people leave a comment ted.sort_values('views_per_comment').head()
Lessons:
# line plot is not appropriate here (use it to measure something over time) ted.comments.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x1098cc0f0>
# histogram shows the frequency distribution of a single numeric variable ted.comments.plot(kind='hist')
<matplotlib.axes._subplots.AxesSubplot at 0x11a501ba8>
# modify the plot to be more informative ted[ted.comments < 1000].comments.plot(kind='hist')
<matplotlib.axes._subplots.AxesSubplot at 0x11a5184e0>
# check how many observations we removed from the plot ted[ted.comments >= 1000].shape
(32, 19)
# can also write this using the query method ted.query('comments < 1000').comments.plot(kind='hist')
<matplotlib.axes._subplots.AxesSubplot at 0x11ac4a2b0>
# can also write this using the loc accessor ted.loc[ted.comments < 1000, 'comments'].plot(kind='hist')
<matplotlib.axes._subplots.AxesSubplot at 0x11a527d30>
# increase the number of bins to see more detail ted.loc[ted.comments < 1000, 'comments'].plot(kind='hist', bins=20)
<matplotlib.axes._subplots.AxesSubplot at 0x11aec0278>
# boxplot can also show distributions, but it's far less useful for concentrated distributions because of outliers ted.loc[ted.comments < 1000, 'comments'].plot(kind='box')
<matplotlib.axes._subplots.AxesSubplot at 0x11afe42b0>
Bonus exercise: calculate the average delay between filming and publishing
# event column does not always include the year ted.event.sample(10)
2012 TEDxBoulder 1307 TEDxUCL 144 TEDGlobal 2007 1739 TED2014 1529 TEDGlobal 2013 1181 TEDxWomen 2011 2150 TEDYouth 2015 1719 TED2014 64 TED2007 1178 TEDxCambridge Name: event, dtype: object
# dataset documentation for film_date says "Unix timestamp of the filming" ted.film_date.head()
0 1140825600 1 1140825600 2 1140739200 3 1140912000 4 1140566400 Name: film_date, dtype: int64
# results don't look right pd.to_datetime(ted.film_date).head()
0 1970-01-01 00:00:01.140825600 1 1970-01-01 00:00:01.140825600 2 1970-01-01 00:00:01.140739200 3 1970-01-01 00:00:01.140912000 4 1970-01-01 00:00:01.140566400 Name: film_date, dtype: datetime64[ns]
pandas documentation for to_datetime
to_datetime
# now the results look right pd.to_datetime(ted.film_date, unit='s').head()
0 2006-02-25 1 2006-02-25 2 2006-02-24 3 2006-02-26 4 2006-02-22 Name: film_date, dtype: datetime64[ns]
ted['film_datetime'] = pd.to_datetime(ted.film_date, unit='s')
# verify that event name matches film_datetime for a random sample ted[['event', 'film_datetime']].sample(5)
# new column uses the datetime data type (this was an automatic conversion) ted.dtypes
comments int64 description object duration int64 event object film_date int64 languages int64 main_speaker object name object num_speaker int64 published_date int64 ratings object related_talks object speaker_occupation object tags object title object url object views int64 comments_per_view float64 views_per_comment float64 film_datetime datetime64[ns] dtype: object
# datetime columns have convenient attributes under the dt namespace ted.film_datetime.dt.year.head()
0 2006 1 2006 2 2006 3 2006 4 2006 Name: film_datetime, dtype: int64
# similar to string methods under the str namespace ted.event.str.lower().head()
0 ted2006 1 ted2006 2 ted2006 3 ted2006 4 ted2006 Name: event, dtype: object
# count the number of talks each year using value_counts() ted.film_datetime.dt.year.value_counts()
2013 270 2011 270 2010 267 2012 267 2016 246 2015 239 2014 237 2009 232 2007 114 2017 98 2008 84 2005 66 2006 50 2003 33 2004 33 2002 27 1998 6 2001 5 1983 1 1991 1 1994 1 1990 1 1984 1 1972 1 Name: film_datetime, dtype: int64
# points are plotted and connected in the order you give them to pandas ted.film_datetime.dt.year.value_counts().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x11a477278>
# need to sort the index before plotting ted.film_datetime.dt.year.value_counts().sort_index().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x11b48dc18>
# we only have partial data for 2017 ted.film_datetime.max()
Timestamp('2017-08-27 00:00:00')
# count the number of talks (great if you value variety, but they may not be great talks) ted.event.value_counts().head()
TED2014 84 TED2009 83 TED2013 77 TED2016 77 TED2015 75 Name: event, dtype: int64
# use views as a proxy for "quality of talk" ted.groupby('event').views.mean().head()
event AORN Congress 149818.0 Arbejdsglaede Live 971594.0 BBC TV 521974.0 Bowery Poetry Club 676741.0 Business Innovation Factory 304086.0 Name: views, dtype: float64
# find the largest values, but we don't know how many talks are being averaged ted.groupby('event').views.mean().sort_values().tail()
event TEDxNorrkoping 6569493.0 TEDxCreativeCoast 8444981.0 TEDxBloomington 9484259.5 TEDxHouston 16140250.5 TEDxPuget Sound 34309432.0 Name: views, dtype: float64
# show the number of talks along with the mean (events with the highest means had only 1 or 2 talks) ted.groupby('event').views.agg(['count', 'mean']).sort_values('mean').tail()
# calculate the total views per event ted.groupby('event').views.agg(['count', 'mean', 'sum']).sort_values('sum').tail()
# previously, users could tag talks on the TED website (funny, inspiring, confusing, etc.) ted.ratings.head()
0 [{'id': 7, 'name': 'Funny', 'count': 19645}, {... 1 [{'id': 7, 'name': 'Funny', 'count': 544}, {'i... 2 [{'id': 7, 'name': 'Funny', 'count': 964}, {'i... 3 [{'id': 3, 'name': 'Courageous', 'count': 760}... 4 [{'id': 9, 'name': 'Ingenious', 'count': 3202}... Name: ratings, dtype: object
# two ways to examine the ratings data for the first talk ted.loc[0, 'ratings'] ted.ratings[0]
"[{'id': 7, 'name': 'Funny', 'count': 19645}, {'id': 1, 'name': 'Beautiful', 'count': 4573}, {'id': 9, 'name': 'Ingenious', 'count': 6073}, {'id': 3, 'name': 'Courageous', 'count': 3253}, {'id': 11, 'name': 'Longwinded', 'count': 387}, {'id': 2, 'name': 'Confusing', 'count': 242}, {'id': 8, 'name': 'Informative', 'count': 7346}, {'id': 22, 'name': 'Fascinating', 'count': 10581}, {'id': 21, 'name': 'Unconvincing', 'count': 300}, {'id': 24, 'name': 'Persuasive', 'count': 10704}, {'id': 23, 'name': 'Jaw-dropping', 'count': 4439}, {'id': 25, 'name': 'OK', 'count': 1174}, {'id': 26, 'name': 'Obnoxious', 'count': 209}, {'id': 10, 'name': 'Inspiring', 'count': 24924}]"
# this is a string not a list type(ted.ratings[0])
str
# convert this into something useful using Python's ast module (Abstract Syntax Tree) import ast
# literal_eval() allows you to evaluate a string containing a Python literal or container ast.literal_eval('[1, 2, 3]')
[1, 2, 3]
# if you have a string representation of something, you can retrieve what it actually represents type(ast.literal_eval('[1, 2, 3]'))
list
# unpack the ratings data for the first talk ast.literal_eval(ted.ratings[0])
[{'id': 7, 'name': 'Funny', 'count': 19645}, {'id': 1, 'name': 'Beautiful', 'count': 4573}, {'id': 9, 'name': 'Ingenious', 'count': 6073}, {'id': 3, 'name': 'Courageous', 'count': 3253}, {'id': 11, 'name': 'Longwinded', 'count': 387}, {'id': 2, 'name': 'Confusing', 'count': 242}, {'id': 8, 'name': 'Informative', 'count': 7346}, {'id': 22, 'name': 'Fascinating', 'count': 10581}, {'id': 21, 'name': 'Unconvincing', 'count': 300}, {'id': 24, 'name': 'Persuasive', 'count': 10704}, {'id': 23, 'name': 'Jaw-dropping', 'count': 4439}, {'id': 25, 'name': 'OK', 'count': 1174}, {'id': 26, 'name': 'Obnoxious', 'count': 209}, {'id': 10, 'name': 'Inspiring', 'count': 24924}]
# now we have a list (of dictionaries) type(ast.literal_eval(ted.ratings[0]))
# define a function to convert an element in the ratings Series from string to list def str_to_list(ratings_str): return ast.literal_eval(ratings_str)
# test the function str_to_list(ted.ratings[0])
# Series apply method applies a function to every element in a Series and returns a Series ted.ratings.apply(str_to_list).head()
# lambda is a shorter alternative ted.ratings.apply(lambda x: ast.literal_eval(x)).head()
# an even shorter alternative is to apply the function directly (without lambda) ted.ratings.apply(ast.literal_eval).head()
ted['ratings_list'] = ted.ratings.apply(lambda x: ast.literal_eval(x))
# check that the new Series looks as expected ted.ratings_list[0]
# each element in the Series is a list type(ted.ratings_list[0])
# data type of the new Series is object ted.ratings_list.dtype
dtype('O')
# object is not just for strings ted.dtypes
comments int64 description object duration int64 event object film_date int64 languages int64 main_speaker object name object num_speaker int64 published_date int64 ratings object related_talks object speaker_occupation object tags object title object url object views int64 comments_per_view float64 views_per_comment float64 film_datetime datetime64[ns] ratings_list object dtype: object
Bonus exercises:
# expected result (for each talk) is sum of count ted.ratings_list[0]
# start by building a simple function def get_num_ratings(list_of_dicts): return list_of_dicts[0]
# pass it a list, and it returns the first element in the list, which is a dictionary get_num_ratings(ted.ratings_list[0])
{'id': 7, 'name': 'Funny', 'count': 19645}
# modify the function to return the vote count def get_num_ratings(list_of_dicts): return list_of_dicts[0]['count']
# pass it a list, and it returns a value from the first dictionary in the list get_num_ratings(ted.ratings_list[0])
19645
# modify the function to get the sum of count def get_num_ratings(list_of_dicts): num = 0 for d in list_of_dicts: num = num + d['count'] return num
# looks about right get_num_ratings(ted.ratings_list[0])
93850
# check with another record ted.ratings_list[1]
[{'id': 7, 'name': 'Funny', 'count': 544}, {'id': 3, 'name': 'Courageous', 'count': 139}, {'id': 2, 'name': 'Confusing', 'count': 62}, {'id': 1, 'name': 'Beautiful', 'count': 58}, {'id': 21, 'name': 'Unconvincing', 'count': 258}, {'id': 11, 'name': 'Longwinded', 'count': 113}, {'id': 8, 'name': 'Informative', 'count': 443}, {'id': 10, 'name': 'Inspiring', 'count': 413}, {'id': 22, 'name': 'Fascinating', 'count': 132}, {'id': 9, 'name': 'Ingenious', 'count': 56}, {'id': 24, 'name': 'Persuasive', 'count': 268}, {'id': 23, 'name': 'Jaw-dropping', 'count': 116}, {'id': 26, 'name': 'Obnoxious', 'count': 131}, {'id': 25, 'name': 'OK', 'count': 203}]
# looks about right get_num_ratings(ted.ratings_list[1])
2936
# apply it to every element in the Series ted.ratings_list.apply(get_num_ratings).head()
0 93850 1 2936 2 2824 3 3728 4 25620 Name: ratings_list, dtype: int64
# another alternative is to use a generator expression sum((d['count'] for d in ted.ratings_list[0]))
# use lambda to apply this method ted.ratings_list.apply(lambda x: sum((d['count'] for d in x))).head()
# another alternative is to use pd.DataFrame() pd.DataFrame(ted.ratings_list[0])['count'].sum()
# use lambda to apply this method ted.ratings_list.apply(lambda x: pd.DataFrame(x)['count'].sum()).head()
ted['num_ratings'] = ted.ratings_list.apply(get_num_ratings)
# do one more check ted.num_ratings.describe()
count 2550.000000 mean 2436.408235 std 4226.795631 min 68.000000 25% 870.750000 50% 1452.500000 75% 2506.750000 max 93850.000000 Name: num_ratings, dtype: float64
# "Funny" is not always the first dictionary in the list ted.ratings_list.head()
0 [{'id': 7, 'name': 'Funny', 'count': 19645}, {... 1 [{'id': 7, 'name': 'Funny', 'count': 544}, {'i... 2 [{'id': 7, 'name': 'Funny', 'count': 964}, {'i... 3 [{'id': 3, 'name': 'Courageous', 'count': 760}... 4 [{'id': 9, 'name': 'Ingenious', 'count': 3202}... Name: ratings_list, dtype: object
# check ratings (not ratings_list) to see if "Funny" is always a rating type ted.ratings.str.contains('Funny').value_counts()
True 2550 Name: ratings, dtype: int64
# write a custom function def get_funny_ratings(list_of_dicts): for d in list_of_dicts: if d['name'] == 'Funny': return d['count']
# examine a record in which "Funny" is not the first dictionary ted.ratings_list[3]
[{'id': 3, 'name': 'Courageous', 'count': 760}, {'id': 1, 'name': 'Beautiful', 'count': 291}, {'id': 2, 'name': 'Confusing', 'count': 32}, {'id': 7, 'name': 'Funny', 'count': 59}, {'id': 9, 'name': 'Ingenious', 'count': 105}, {'id': 21, 'name': 'Unconvincing', 'count': 36}, {'id': 11, 'name': 'Longwinded', 'count': 53}, {'id': 8, 'name': 'Informative', 'count': 380}, {'id': 10, 'name': 'Inspiring', 'count': 1070}, {'id': 22, 'name': 'Fascinating', 'count': 132}, {'id': 24, 'name': 'Persuasive', 'count': 460}, {'id': 23, 'name': 'Jaw-dropping', 'count': 230}, {'id': 26, 'name': 'Obnoxious', 'count': 35}, {'id': 25, 'name': 'OK', 'count': 85}]
# check that the function works get_funny_ratings(ted.ratings_list[3])
59
# apply it to every element in the Series ted['funny_ratings'] = ted.ratings_list.apply(get_funny_ratings) ted.funny_ratings.head()
0 19645 1 544 2 964 3 59 4 1390 Name: funny_ratings, dtype: int64
# check for missing values ted.funny_ratings.isna().sum()
0
ted['funny_rate'] = ted.funny_ratings / ted.num_ratings
# "gut check" that this calculation makes sense by examining the occupations of the funniest talks ted.sort_values('funny_rate').speaker_occupation.tail(20)
1849 Science humorist 337 Comedian 124 Performance poet, multimedia artist 315 Expert 1168 Social energy entrepreneur 1468 Ornithologist 595 Comedian, voice artist 1534 Cartoon editor 97 Satirist 2297 Actor, writer 568 Comedian 675 Data scientist 21 Humorist, web artist 194 Jugglers 2273 Comedian and writer 2114 Comedian and writer 173 Investor 747 Comedian 1398 Comedian 685 Actor, comedian, playwright Name: speaker_occupation, dtype: object
# examine the occupations of the least funny talks ted.sort_values('funny_rate').speaker_occupation.head(20)
2549 Game designer 1612 Biologist 612 Sculptor 998 Penguin expert 593 Engineer 284 Space activist 1041 Biomedical engineer 1618 Spinal cord researcher 2132 Computational geneticist 442 Sculptor 426 Author, thinker 458 Educator 2437 Environmental engineer 1491 Photojournalist 1893 Forensic anthropologist 783 Marine biologist 195 Kenyan MP 772 HIV/AIDS fighter 788 Building activist 936 Neuroengineer Name: speaker_occupation, dtype: object
# calculate the mean funny rate for each occupation ted.groupby('speaker_occupation').funny_rate.mean().sort_values().tail()
speaker_occupation Comedian 0.512457 Actor, writer 0.515152 Actor, comedian, playwright 0.558107 Jugglers 0.566828 Comedian and writer 0.602085 Name: funny_rate, dtype: float64
# however, most of the occupations have a sample size of 1 ted.speaker_occupation.describe()
count 2544 unique 1458 top Writer freq 45 Name: speaker_occupation, dtype: object
# count how many times each occupation appears ted.speaker_occupation.value_counts()
Writer 45 Artist 34 Designer 34 Journalist 33 Entrepreneur 31 Architect 30 Inventor 27 Psychologist 26 Photographer 25 Filmmaker 21 Author 20 Economist 20 Neuroscientist 20 Educator 20 Roboticist 16 Philosopher 16 Biologist 15 Physicist 14 Musician 11 Marine biologist 11 Technologist 10 Activist 10 Global health expert; data visionary 10 Historian 9 Singer/songwriter 9 Oceanographer 9 Behavioral economist 9 Poet 9 Astronomer 9 Graphic designer 9 .. Anatomical artist 1 Literary scholar 1 Social entrepreneur, lawyer 1 Physician, bioengineer and entrepreneur 1 medical inventor 1 Mental health advocate 1 Public sector researcher 1 Speleologist 1 Disaster relief expert 1 Artist and curator 1 Finance journalist 1 Wildlife conservationist 1 Sex worker and activist 1 Connector 1 Sociologist, human rights activist 1 Author, producer 1 Painter 1 Policy expert 1 Environmental economist 1 Sound artist, composer 1 Senator 1 High school principal 1 Poet of code 1 Healthcare revolutionary 1 Circular economy advocate 1 Caregiver 1 Transportation geek 1 Music icon 1 Surprisologist 1 Psychiatrist and writer 1 Name: speaker_occupation, Length: 1458, dtype: int64
# value_counts() outputs a pandas Series, thus we can use pandas to manipulate the output occupation_counts = ted.speaker_occupation.value_counts() type(occupation_counts)
pandas.core.series.Series
# show occupations which appear at least 5 times occupation_counts[occupation_counts >= 5]
Writer 45 Artist 34 Designer 34 Journalist 33 Entrepreneur 31 Architect 30 Inventor 27 Psychologist 26 Photographer 25 Filmmaker 21 Author 20 Economist 20 Neuroscientist 20 Educator 20 Roboticist 16 Philosopher 16 Biologist 15 Physicist 14 Musician 11 Marine biologist 11 Technologist 10 Activist 10 Global health expert; data visionary 10 Historian 9 Singer/songwriter 9 Oceanographer 9 Behavioral economist 9 Poet 9 Astronomer 9 Graphic designer 9 .. Legal activist 6 Photojournalist 6 Evolutionary biologist 6 Singer-songwriter 6 Performance poet, multimedia artist 6 Climate advocate 6 Techno-illusionist 6 Social entrepreneur 6 Comedian 6 Reporter 6 Writer, activist 6 Investor and advocate for moral leadership 5 Surgeon 5 Paleontologist 5 Physician 5 Tech visionary 5 Chef 5 Science writer 5 Game designer 5 Cartoonist 5 Producer 5 Violinist 5 Researcher 5 Social Media Theorist 5 Environmentalist, futurist 5 Data scientist 5 Musician, activist 5 Sculptor 5 Chemist 5 Sound consultant 5 Name: speaker_occupation, Length: 68, dtype: int64
# save the index of this Series top_occupations = occupation_counts[occupation_counts >= 5].index top_occupations
Index(['Writer', 'Artist', 'Designer', 'Journalist', 'Entrepreneur', 'Architect', 'Inventor', 'Psychologist', 'Photographer', 'Filmmaker', 'Author', 'Economist', 'Neuroscientist', 'Educator', 'Roboticist', 'Philosopher', 'Biologist', 'Physicist', 'Musician', 'Marine biologist', 'Technologist', 'Activist', 'Global health expert; data visionary', 'Historian', 'Singer/songwriter', 'Oceanographer', 'Behavioral economist', 'Poet', 'Astronomer', 'Graphic designer', 'Philanthropist', 'Novelist', 'Social psychologist', 'Engineer', 'Computer scientist', 'Futurist', 'Astrophysicist', 'Mathematician', 'Legal activist', 'Photojournalist', 'Evolutionary biologist', 'Singer-songwriter', 'Performance poet, multimedia artist', 'Climate advocate', 'Techno-illusionist', 'Social entrepreneur', 'Comedian', 'Reporter', 'Writer, activist', 'Investor and advocate for moral leadership', 'Surgeon', 'Paleontologist', 'Physician', 'Tech visionary', 'Chef', 'Science writer', 'Game designer', 'Cartoonist', 'Producer', 'Violinist', 'Researcher', 'Social Media Theorist', 'Environmentalist, futurist', 'Data scientist', 'Musician, activist', 'Sculptor', 'Chemist', 'Sound consultant'], dtype='object')
# filter DataFrame to include only those occupations ted_top_occupations = ted[ted.speaker_occupation.isin(top_occupations)] ted_top_occupations.shape
(786, 24)
# redo the previous groupby ted_top_occupations.groupby('speaker_occupation').funny_rate.mean().sort_values()
speaker_occupation Surgeon 0.002465 Physician 0.004515 Photojournalist 0.004908 Investor and advocate for moral leadership 0.005198 Photographer 0.007152 Environmentalist, futurist 0.007317 Violinist 0.009534 Singer-songwriter 0.010597 Chemist 0.010970 Philanthropist 0.012522 Activist 0.012539 Astrophysicist 0.013147 Oceanographer 0.014596 Paleontologist 0.015780 Social psychologist 0.015887 Tech visionary 0.016654 Sculptor 0.016960 Social Media Theorist 0.017450 Social entrepreneur 0.017921 Inventor 0.021801 Sound consultant 0.022011 Legal activist 0.022303 Historian 0.023215 Musician, activist 0.023395 Economist 0.025488 Writer, activist 0.026665 Journalist 0.027997 Computer scientist 0.029070 Architect 0.030579 Engineer 0.031711 ... Roboticist 0.042777 Astronomer 0.044581 Psychologist 0.044984 Musician 0.045336 Physicist 0.046302 Filmmaker 0.048603 Futurist 0.050460 Behavioral economist 0.050460 Technologist 0.050965 Chef 0.054207 Science writer 0.055993 Designer 0.059287 Writer 0.060745 Game designer 0.062317 Reporter 0.066250 Evolutionary biologist 0.069157 Novelist 0.070876 Entrepreneur 0.073295 Author 0.075508 Artist 0.078939 Global health expert; data visionary 0.090306 Poet 0.107398 Graphic designer 0.135718 Techno-illusionist 0.152171 Cartoonist 0.162120 Data scientist 0.184076 Producer 0.202531 Singer/songwriter 0.252205 Performance poet, multimedia artist 0.306468 Comedian 0.512457 Name: funny_rate, Length: 68, dtype: float64