Using TFIDF to Segment Artists¶
In this exercise you're going to answer the age old question of "What is that artist actually singing about?"
Assignment¶
- Use the skills we just went over to calcualte TFIDF scores for terms in a song (document) against the terms used in all songs (corpus).
- Use these scores to judge what the song is about.
Extra
- Expand this to the artist. Use TFIDF to figure out what an artist sings about.
Loading the Song Lyrics Dataset¶
from google.colab import drive
import pandas as pd
drive.mount('/content/gdrive')
df = pd.read_csv('/content/gdrive/My Drive/datasets/songs.csv')
Getting out the lyrics of every song¶
all_terms = df['Lyrics'].str.cat(sep=" ").split()
Getting the terms out of a specific song¶
tylor_df = df[df["Artist"] == "Taylor Swift"]
lover_song_lyrics = taylor_df[df["Title"] == "Lover"].iloc[0]["Lyrics"]
lover_song_terms = lover_song_lyrics.split()
The TFIDF Example from the Slides¶
import math
corpus = ["John likes to watch movies.",
"Mary likes movies too.",
"John also likes to watch football games."]
tgt_doc = corpus[2]
# Term Frequency
term_freq = {}
for word in tgt_doc.split():
term_freq[word] = term_freq.get(word, 0) + 1
# Inverse Document Frequency
idf = {}
for doc in corpus:
for word in set(doc.split()):
idf[word] = idf.get(word, 0) + 1
for word in idf:
idf[word] = math.log(len(corpus) / idf[word])
# TF-IDF
tfidf = {}
for word in tgt_doc.split():
tfidf[word] = term_freq[word] * idf[word]
print(tfidf)
Spacy Preprocessing Example¶
import spacy
from nltk.stem import PorterStemmer
nlp = spacy.load("en_core_web_sm")
# Remove punctuation
doc = nlp(sentence) # Initialize as a spaCy object (list of tokens)
words = []
for token in doc:
if not token.is_punct:
words.append(token.text)
sentence = ' '.join(words)
print(sentence)
# Remove stop words
doc = nlp(sentence)
words = []
for token in doc:
if not token.is_stop:
words.append(token.text)
sentence = ' '.join(words)
print(sentence)
# Stemming
doc = sentence.split(" ")
stemmer = PorterStemmer()
words = []
for token in doc:
words.append(stemmer.stem(token))
sentence = ' '.join(words)
print(sentence)
# Lemmatization
oc = nlp(sentence)
words = []
for token in doc:
words.append(token.lemma_)
sentence = ' '.join(words)
Note
By default, you're using the _sm
(small) spacy language model. If you'd like to use the larger (better) model, run this line:
!python -m spacy download en_core_web_lg
And then refernce it when you load the nlp
variable:
nlp = spacy.load("en_core_web_sm")
In [31]:
Copied!
import pandas as pd
import math
import sys
from google.colab import drive
drive.mount('/content/gdrive')
import spacy
from nltk.stem import PorterStemmer
nlp = spacy.load("en_core_web_sm")
stemmer = PorterStemmer()
def get_lyrics(songs_df, artist, title):
"""Given the songs.csv dataframe, pulls out the lyrics for a particular artist and song.
"""
return songs_df[songs_df["Artist"] == artist][df["Title"] == title].iloc[0]["Lyrics"]
def preprocess(doc_str, with_stemming=False, with_lemmatization=False):
"""preprocess takes a string, doc_str, and returns the string preprocessed.
By default, preprocessing means lowercasing, removing punctuation, and
removing stop words.
Optionally, you may stem or lemmatize as well by passing with_stemming=True
or with_lemmatization=True.
"""
# Lowercase
doc_str = doc_str.lower()
doc = nlp(doc_str) # Initialize as a spaCy object (list of tokens)
words = []
for token in doc:
# Skip punctuation and stop words
if not token.is_punct and not token.is_stop:
text = token.text
if with_lemmatization:
text = token.lemma_
if with_stemming:
text = stemmer.stem(text)
words.append(text)
# Turn them back into one string
doc_str = " ".join(words)
return doc_str
def tfidf(corpus, tgt_doc):
"""tfidf takes a corpus (a list of strings) and a tgt_doc (a string within
corpus) and returns a dictionary of TFIDF scores for each term in tgt_doc.
"""
# Term Frequency
term_freq = {}
for word in tgt_doc.split(" "):
term_freq[word] = term_freq.get(word, 0) + 1
# Inverse Document Frequency
idf = {}
for doc in corpus:
for word in set(doc.split(" ")):
idf[word] = idf.get(word, 0) + 1
for word in idf:
idf[word] = math.log(len(corpus) / idf[word])
# TF-IDF
tfidf = {}
for word in tgt_doc.split(" "):
tfidf[word] = term_freq[word] * idf[word]
return tfidf
df = pd.read_csv('/content/gdrive/My Drive/datasets/songs.csv')
corpus = list(df["Lyrics"])
tgt_doc = get_lyrics(df, "Taylor Swift", "Lover")
# Preprocess
tgt_doc = preprocess(tgt_doc)
for i, doc in enumerate(corpus):
print(f"\rpreprocessing {i}/{len(corpus)}", end='')
sys.stdout.flush()
corpus[i] = preprocess(doc)
tfidf_dict = tfidf(corpus, tgt_doc)
tfidf_df = pd.DataFrame(tfidf_dict.items(), columns=["term", "tfidf"])
tfidf_df.sort_values("tfidf", ascending=False)
import pandas as pd
import math
import sys
from google.colab import drive
drive.mount('/content/gdrive')
import spacy
from nltk.stem import PorterStemmer
nlp = spacy.load("en_core_web_sm")
stemmer = PorterStemmer()
def get_lyrics(songs_df, artist, title):
"""Given the songs.csv dataframe, pulls out the lyrics for a particular artist and song.
"""
return songs_df[songs_df["Artist"] == artist][df["Title"] == title].iloc[0]["Lyrics"]
def preprocess(doc_str, with_stemming=False, with_lemmatization=False):
"""preprocess takes a string, doc_str, and returns the string preprocessed.
By default, preprocessing means lowercasing, removing punctuation, and
removing stop words.
Optionally, you may stem or lemmatize as well by passing with_stemming=True
or with_lemmatization=True.
"""
# Lowercase
doc_str = doc_str.lower()
doc = nlp(doc_str) # Initialize as a spaCy object (list of tokens)
words = []
for token in doc:
# Skip punctuation and stop words
if not token.is_punct and not token.is_stop:
text = token.text
if with_lemmatization:
text = token.lemma_
if with_stemming:
text = stemmer.stem(text)
words.append(text)
# Turn them back into one string
doc_str = " ".join(words)
return doc_str
def tfidf(corpus, tgt_doc):
"""tfidf takes a corpus (a list of strings) and a tgt_doc (a string within
corpus) and returns a dictionary of TFIDF scores for each term in tgt_doc.
"""
# Term Frequency
term_freq = {}
for word in tgt_doc.split(" "):
term_freq[word] = term_freq.get(word, 0) + 1
# Inverse Document Frequency
idf = {}
for doc in corpus:
for word in set(doc.split(" ")):
idf[word] = idf.get(word, 0) + 1
for word in idf:
idf[word] = math.log(len(corpus) / idf[word])
# TF-IDF
tfidf = {}
for word in tgt_doc.split(" "):
tfidf[word] = term_freq[word] * idf[word]
return tfidf
df = pd.read_csv('/content/gdrive/My Drive/datasets/songs.csv')
corpus = list(df["Lyrics"])
tgt_doc = get_lyrics(df, "Taylor Swift", "Lover")
# Preprocess
tgt_doc = preprocess(tgt_doc)
for i, doc in enumerate(corpus):
print(f"\rpreprocessing {i}/{len(corpus)}", end='')
sys.stdout.flush()
corpus[i] = preprocess(doc)
tfidf_dict = tfidf(corpus, tgt_doc)
tfidf_df = pd.DataFrame(tfidf_dict.items(), columns=["term", "tfidf"])
tfidf_df.sort_values("tfidf", ascending=False)
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
<ipython-input-31-d4f166802a2d>:18: UserWarning: Boolean Series key will be reindexed to match DataFrame index. return songs_df[songs_df["Artist"] == artist][df["Title"] == title].iloc[0]["Lyrics"]
Out[31]:
term | tfidf | |
---|---|---|
67 | joob | 105.814147 |
28 | goo | 82.883319 |
70 | g'joob | 26.453537 |
26 | eggman | 26.453537 |
69 | g'goo | 26.453537 |
... | ... | ... |
75 | think | 1.442900 |
119 | oh | 0.982172 |
122 | know | 0.604571 |
135 | urlcopyembedcopy | 0.349986 |
0 | \n | 0.051041 |
136 rows × 2 columns
In [18]:
Copied!
# With Lemmetization
corpus = list(df["Lyrics"])
tgt_doc = get_lyrics(df, "Taylor Swift", "Love")
# Preprocess
tgt_doc = preprocess(tgt_doc, with_lemmatization=True)
for i, doc in enumerate(corpus):
sys.stdout.flush()
print(f"\rpreprocessing {i}/{len(corpus)}", end='')
corpus[i] = preprocess(doc, with_lemmatization=True)
tfidf_dict = tfidf(corpus, tgt_doc)
tfidf_df = pd.DataFrame(tfidf_dict.items(), columns=["term", "tfidf"])
tfidf_df.sort_values("tfidf", ascending=False)
# With Lemmetization
corpus = list(df["Lyrics"])
tgt_doc = get_lyrics(df, "Taylor Swift", "Love")
# Preprocess
tgt_doc = preprocess(tgt_doc, with_lemmatization=True)
for i, doc in enumerate(corpus):
sys.stdout.flush()
print(f"\rpreprocessing {i}/{len(corpus)}", end='')
corpus[i] = preprocess(doc, with_lemmatization=True)
tfidf_dict = tfidf(corpus, tgt_doc)
tfidf_df = pd.DataFrame(tfidf_dict.items(), columns=["term", "tfidf"])
tfidf_df.sort_values("tfidf", ascending=False)
preprocessing 0/745
<ipython-input-18-ab14268a5eec>:4: UserWarning: Boolean Series key will be reindexed to match DataFrame index. tgt_doc = df[df["Artist"] == "Taylor Swift"][df["Title"] == "Lover"].iloc[0]["Lyrics"]
preprocessing 744/745
Out[18]:
term | tfidf | |
---|---|---|
18 | forever | 12.851665 |
21 | lover | 12.851665 |
17 | close | 7.094667 |
19 | ah | 6.808736 |
28 | suspicious | 6.613384 |
... | ... | ... |
16 | \n\n | 0.832623 |
31 | love | 0.681139 |
13 | know | 0.482158 |
60 | urlcopyembedcopy | 0.349986 |
5 | \n | 0.037609 |
61 rows × 2 columns
In [17]:
Copied!
# With Stemming and Lemmetization
corpus = list(df["Lyrics"])
tgt_doc = get_lyrics(df, "Taylor Swift", "Love")
# Preprocess
tgt_doc = preprocess(tgt_doc, with_stemming=True, with_lemmatization=True)
for i, doc in enumerate(corpus):
sys.stdout.flush()
print(f"\rpreprocessing {i}/{len(corpus)}", end='')
corpus[i] = preprocess(doc, with_stemming=True, with_lemmatization=True)
tfidf_dict = tfidf(corpus, tgt_doc)
tfidf_df = pd.DataFrame(tfidf_dict.items(), columns=["term", "tfidf"])
tfidf_df.sort_values("tfidf", ascending=False)
# With Stemming and Lemmetization
corpus = list(df["Lyrics"])
tgt_doc = get_lyrics(df, "Taylor Swift", "Love")
# Preprocess
tgt_doc = preprocess(tgt_doc, with_stemming=True, with_lemmatization=True)
for i, doc in enumerate(corpus):
sys.stdout.flush()
print(f"\rpreprocessing {i}/{len(corpus)}", end='')
corpus[i] = preprocess(doc, with_stemming=True, with_lemmatization=True)
tfidf_dict = tfidf(corpus, tgt_doc)
tfidf_df = pd.DataFrame(tfidf_dict.items(), columns=["term", "tfidf"])
tfidf_df.sort_values("tfidf", ascending=False)
preprocessing 0/745
<ipython-input-17-edd58372a846>:4: UserWarning: Boolean Series key will be reindexed to match DataFrame index. tgt_doc = df[df["Artist"] == "Taylor Swift"][df["Title"] == "Lover"].iloc[0]["Lyrics"]
preprocessing 744/745
Out[17]:
term | tfidf | |
---|---|---|
18 | forev | 12.851665 |
21 | lover | 12.851665 |
17 | close | 7.010154 |
19 | ah | 6.808736 |
50 | overdramat | 6.613384 |
... | ... | ... |
16 | \n\n | 0.832623 |
31 | love | 0.673213 |
13 | know | 0.482158 |
60 | urlcopyembedcopi | 0.349986 |
5 | \n | 0.037609 |
61 rows × 2 columns
In [25]:
Copied!
def get_lyrics(songs_df, artist, title):
"""Given the songs.csv dataframe, pulls out the lyrics for a particular artist and song.
"""
return songs_df[songs_df["Artist"] == artist][df["Title"] == title].iloc[0]["Lyrics"]
get_lyrics(df, "Taylor Swift", "Lover")
def get_lyrics(songs_df, artist, title):
"""Given the songs.csv dataframe, pulls out the lyrics for a particular artist and song.
"""
return songs_df[songs_df["Artist"] == artist][df["Title"] == title].iloc[0]["Lyrics"]
get_lyrics(df, "Taylor Swift", "Lover")
<ipython-input-25-10ffb6885a7e>:4: UserWarning: Boolean Series key will be reindexed to match DataFrame index. return songs_df[songs_df["Artist"] == artist][df["Title"] == title].iloc[0]["Lyrics"]
Out[25]:
"We could leave the Christmas lights up 'til January\nAnd this is our place, we make the rules\nAnd there's a dazzling haze, a mysterious way about you, dear\nHave I known you twenty seconds or twenty years?\n\nCan I go where you go?\nCan we always be this close?\nForever and ever, ah\nTake me out, and take me home\nYou're my, my, my, my lover\n\nWe could let our friends crash in the living room\nThis is our place, we make the call\nAnd I'm highly suspicious that everyone who sees you wants you\nI've loved you three summers now, honey, but I want 'em all\n\nCan I go where you go?\nCan we always be this close?\nForever and ever, ah\nTake me out, and take me home (Forever and ever)\nYou're my, my, my, my lover\nLadies and gentlemen, will you please stand?\nWith every guitar string scar on my hand\nI take this magnetic force of a man to be my lover\nMy heart's been borrowed and yours has been blue\nAll's well that ends well to end up with you\nSwear to be overdramatic and true to my lover\nAnd you'll save all your dirtiest jokes for me\nAnd at every table, I'll save you a seat, lover\n\nCan I go where you go?\nCan we always be this close?\nForever and ever, ah\nTake me out, and take me home (Forever and ever)\nYou're my, my, my, my\nOh, you're my, my, my, my\nDarling, you're my, my, my, my lover108EmbedShare URLCopyEmbedCopy"
In [30]:
Copied!
df[df["Artist"] == "The Beatles"]["Title"].unique()
df[df["Artist"] == "The Beatles"]["Title"].unique()
Out[30]:
array(['Let It Be', 'Yesterday', 'Come Together', 'Hey Jude', 'Here Comes the Sun', 'Something', 'A Day in the Life', 'Blackbird', 'I Am the Walrus', 'Eleanor Rigby', 'In My Life', 'While My Guitar Gently Weeps', 'Lucy in the Sky with Diamonds', 'Across the Universe', 'Strawberry Fields Forever', 'With a Little Help from My Friends', 'Michelle', 'Help!', 'I Want to Hold Your Hand', 'Ob-La-Di, Ob-La-Da', 'Yellow Submarine', 'Norwegian Wood (This Bird Has Flown)', 'Don’t Let Me Down', 'When I’m Sixty-Four', 'Rocky Raccoon', 'Oh! Darling', 'Back in the U.S.S.R.', 'Happiness is a Warm Gun', 'Golden Slumbers', 'Penny Lane', 'Helter Skelter', 'All You Need Is Love', 'Maxwell’s Silver Hammer', 'She Loves You', 'Revolution'], dtype=object)
In [ ]:
Copied!