Using TFIDF to Segment Artists¶

In this exercise you're going to answer the age old question of "What is that artist actually singing about?"

Assignment¶

Use the skills we just went over to calcualte TFIDF scores for terms in a song (document) against the terms used in all songs (corpus).
Use these scores to judge what the song is about.

Extra

Expand this to the artist. Use TFIDF to figure out what an artist sings about.

Loading the Song Lyrics Dataset¶

from google.colab import drive
import pandas as pd

drive.mount('/content/gdrive')
df = pd.read_csv('/content/gdrive/My Drive/datasets/songs.csv')

Getting out the lyrics of every song¶

all_terms = df['Lyrics'].str.cat(sep=" ").split()

Getting the terms out of a specific song¶

tylor_df = df[df["Artist"] == "Taylor Swift"]
lover_song_lyrics = taylor_df[df["Title"] == "Lover"].iloc[0]["Lyrics"]
lover_song_terms = lover_song_lyrics.split()

The TFIDF Example from the Slides¶

import math

corpus = ["John likes to watch movies.",
          "Mary likes movies too.",
          "John also likes to watch football games."]
tgt_doc = corpus[2]

# Term Frequency
term_freq = {}
for word in tgt_doc.split():
    term_freq[word] = term_freq.get(word, 0) + 1

# Inverse Document Frequency
idf = {}
for doc in corpus:
    for word in set(doc.split()):
        idf[word] = idf.get(word, 0) + 1
for word in idf:
    idf[word] = math.log(len(corpus) / idf[word])

# TF-IDF
tfidf = {}
for word in tgt_doc.split():
    tfidf[word] = term_freq[word] * idf[word]
print(tfidf)

Spacy Preprocessing Example¶

import spacy
from nltk.stem import PorterStemmer

nlp = spacy.load("en_core_web_sm")

# Remove punctuation
doc = nlp(sentence)  # Initialize as a spaCy object (list of tokens)
words = []
for token in doc:
    if not token.is_punct:
        words.append(token.text)
sentence = ' '.join(words)
print(sentence)

# Remove stop words
doc = nlp(sentence)
words = []
for token in doc:
    if not token.is_stop:
        words.append(token.text)
sentence = ' '.join(words)
print(sentence)

# Stemming
doc = sentence.split(" ")
stemmer = PorterStemmer()
words = []
for token in doc:
    words.append(stemmer.stem(token))
sentence = ' '.join(words)
print(sentence)

# Lemmatization
oc = nlp(sentence)
words = []
for token in doc:
    words.append(token.lemma_)
sentence = ' '.join(words)

Note

By default, you're using the _sm (small) spacy language model. If you'd like to use the larger (better) model, run this line:

!python -m spacy download en_core_web_lg

And then refernce it when you load the nlp variable:

nlp = spacy.load("en_core_web_sm")

In [31]:

Copied!





import pandas as pd
import math
import sys

from google.colab import drive
drive.mount('/content/gdrive')

import spacy
from nltk.stem import PorterStemmer

nlp = spacy.load("en_core_web_sm")
stemmer = PorterStemmer()


def get_lyrics(songs_df, artist, title):
    """Given the songs.csv dataframe, pulls out the lyrics for a particular artist and song.
    """
    return songs_df[songs_df["Artist"] == artist][df["Title"] == title].iloc[0]["Lyrics"]

def preprocess(doc_str, with_stemming=False, with_lemmatization=False):
    """preprocess takes a string, doc_str, and returns the string preprocessed.

    By default, preprocessing means lowercasing, removing punctuation, and
    removing stop words.

    Optionally, you may stem or lemmatize as well by passing with_stemming=True
    or with_lemmatization=True.
    """
    # Lowercase
    doc_str = doc_str.lower()
    doc = nlp(doc_str)  # Initialize as a spaCy object (list of tokens)
    words = []
    for token in doc:
        # Skip punctuation and stop words
        if not token.is_punct and not token.is_stop:
            text = token.text
            if with_lemmatization:
                text = token.lemma_
            if with_stemming:
                text = stemmer.stem(text)
            words.append(text)

    # Turn them back into one string
    doc_str = " ".join(words)
    return doc_str


def tfidf(corpus, tgt_doc):
    """tfidf takes a corpus (a list of strings) and a tgt_doc (a string within
    corpus) and returns a dictionary of TFIDF scores for each term in tgt_doc.
    """
    # Term Frequency
    term_freq = {}
    for word in tgt_doc.split(" "):
        term_freq[word] = term_freq.get(word, 0) + 1

    # Inverse Document Frequency
    idf = {}
    for doc in corpus:
        for word in set(doc.split(" ")):
            idf[word] = idf.get(word, 0) + 1
    for word in idf:
        idf[word] = math.log(len(corpus) / idf[word])

    # TF-IDF
    tfidf = {}
    for word in tgt_doc.split(" "):
       tfidf[word] = term_freq[word] * idf[word]

    return tfidf

df = pd.read_csv('/content/gdrive/My Drive/datasets/songs.csv')

corpus = list(df["Lyrics"])
tgt_doc = get_lyrics(df, "Taylor Swift", "Lover")

# Preprocess
tgt_doc = preprocess(tgt_doc)
for i, doc in enumerate(corpus):
   print(f"\rpreprocessing {i}/{len(corpus)}", end='')
   sys.stdout.flush()
   corpus[i] = preprocess(doc)

tfidf_dict = tfidf(corpus, tgt_doc)
tfidf_df = pd.DataFrame(tfidf_dict.items(), columns=["term", "tfidf"])
tfidf_df.sort_values("tfidf", ascending=False)
import pandas as pd
import math
import sys

from google.colab import drive
drive.mount('/content/gdrive')

import spacy
from nltk.stem import PorterStemmer

nlp = spacy.load("en_core_web_sm")
stemmer = PorterStemmer()


def get_lyrics(songs_df, artist, title):
    """Given the songs.csv dataframe, pulls out the lyrics for a particular artist and song.
    """
    return songs_df[songs_df["Artist"] == artist][df["Title"] == title].iloc[0]["Lyrics"]

def preprocess(doc_str, with_stemming=False, with_lemmatization=False):
    """preprocess takes a string, doc_str, and returns the string preprocessed.

    By default, preprocessing means lowercasing, removing punctuation, and
    removing stop words.

    Optionally, you may stem or lemmatize as well by passing with_stemming=True
    or with_lemmatization=True.
    """
    # Lowercase
    doc_str = doc_str.lower()
    doc = nlp(doc_str)  # Initialize as a spaCy object (list of tokens)
    words = []
    for token in doc:
        # Skip punctuation and stop words
        if not token.is_punct and not token.is_stop:
            text = token.text
            if with_lemmatization:
                text = token.lemma_
            if with_stemming:
                text = stemmer.stem(text)
            words.append(text)

    # Turn them back into one string
    doc_str = " ".join(words)
    return doc_str


def tfidf(corpus, tgt_doc):
    """tfidf takes a corpus (a list of strings) and a tgt_doc (a string within
    corpus) and returns a dictionary of TFIDF scores for each term in tgt_doc.
    """
    # Term Frequency
    term_freq = {}
    for word in tgt_doc.split(" "):
        term_freq[word] = term_freq.get(word, 0) + 1

    # Inverse Document Frequency
    idf = {}
    for doc in corpus:
        for word in set(doc.split(" ")):
            idf[word] = idf.get(word, 0) + 1
    for word in idf:
        idf[word] = math.log(len(corpus) / idf[word])

    # TF-IDF
    tfidf = {}
    for word in tgt_doc.split(" "):
       tfidf[word] = term_freq[word] * idf[word]

    return tfidf

df = pd.read_csv('/content/gdrive/My Drive/datasets/songs.csv')

corpus = list(df["Lyrics"])
tgt_doc = get_lyrics(df, "Taylor Swift", "Lover")

# Preprocess
tgt_doc = preprocess(tgt_doc)
for i, doc in enumerate(corpus):
   print(f"\rpreprocessing {i}/{len(corpus)}", end='')
   sys.stdout.flush()
   corpus[i] = preprocess(doc)

tfidf_dict = tfidf(corpus, tgt_doc)
tfidf_df = pd.DataFrame(tfidf_dict.items(), columns=["term", "tfidf"])
tfidf_df.sort_values("tfidf", ascending=False)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).

<ipython-input-31-d4f166802a2d>:18: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  return songs_df[songs_df["Artist"] == artist][df["Title"] == title].iloc[0]["Lyrics"]

Out[31]:

	term	tfidf
67	joob	105.814147
28	goo	82.883319
70	g'joob	26.453537
26	eggman	26.453537
69	g'goo	26.453537
...	...	...
75	think	1.442900
119	oh	0.982172
122	know	0.604571
135	urlcopyembedcopy	0.349986
0	\n	0.051041

136 rows × 2 columns

In [18]:

Copied!





# With Lemmetization

corpus = list(df["Lyrics"])
tgt_doc = get_lyrics(df, "Taylor Swift", "Love")

# Preprocess
tgt_doc = preprocess(tgt_doc, with_lemmatization=True)
for i, doc in enumerate(corpus):
    sys.stdout.flush()
    print(f"\rpreprocessing {i}/{len(corpus)}", end='')
    corpus[i] = preprocess(doc, with_lemmatization=True)

tfidf_dict = tfidf(corpus, tgt_doc)
tfidf_df = pd.DataFrame(tfidf_dict.items(), columns=["term", "tfidf"])
tfidf_df.sort_values("tfidf", ascending=False)
# With Lemmetization

corpus = list(df["Lyrics"])
tgt_doc = get_lyrics(df, "Taylor Swift", "Love")

# Preprocess
tgt_doc = preprocess(tgt_doc, with_lemmatization=True)
for i, doc in enumerate(corpus):
    sys.stdout.flush()
    print(f"\rpreprocessing {i}/{len(corpus)}", end='')
    corpus[i] = preprocess(doc, with_lemmatization=True)

tfidf_dict = tfidf(corpus, tgt_doc)
tfidf_df = pd.DataFrame(tfidf_dict.items(), columns=["term", "tfidf"])
tfidf_df.sort_values("tfidf", ascending=False)

preprocessing 0/745

<ipython-input-18-ab14268a5eec>:4: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  tgt_doc = df[df["Artist"] == "Taylor Swift"][df["Title"] == "Lover"].iloc[0]["Lyrics"]

preprocessing 744/745

Out[18]:

	term	tfidf
18	forever	12.851665
21	lover	12.851665
17	close	7.094667
19	ah	6.808736
28	suspicious	6.613384
...	...	...
16	\n\n	0.832623
31	love	0.681139
13	know	0.482158
60	urlcopyembedcopy	0.349986
5	\n	0.037609

61 rows × 2 columns

In [17]:

Copied!





# With Stemming and Lemmetization

corpus = list(df["Lyrics"])
tgt_doc = get_lyrics(df, "Taylor Swift", "Love")

# Preprocess
tgt_doc = preprocess(tgt_doc, with_stemming=True, with_lemmatization=True)
for i, doc in enumerate(corpus):
    sys.stdout.flush()
    print(f"\rpreprocessing {i}/{len(corpus)}", end='')
    corpus[i] = preprocess(doc, with_stemming=True, with_lemmatization=True)

tfidf_dict = tfidf(corpus, tgt_doc)
tfidf_df = pd.DataFrame(tfidf_dict.items(), columns=["term", "tfidf"])
tfidf_df.sort_values("tfidf", ascending=False)
# With Stemming and Lemmetization

corpus = list(df["Lyrics"])
tgt_doc = get_lyrics(df, "Taylor Swift", "Love")

# Preprocess
tgt_doc = preprocess(tgt_doc, with_stemming=True, with_lemmatization=True)
for i, doc in enumerate(corpus):
    sys.stdout.flush()
    print(f"\rpreprocessing {i}/{len(corpus)}", end='')
    corpus[i] = preprocess(doc, with_stemming=True, with_lemmatization=True)

tfidf_dict = tfidf(corpus, tgt_doc)
tfidf_df = pd.DataFrame(tfidf_dict.items(), columns=["term", "tfidf"])
tfidf_df.sort_values("tfidf", ascending=False)

preprocessing 0/745

<ipython-input-17-edd58372a846>:4: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  tgt_doc = df[df["Artist"] == "Taylor Swift"][df["Title"] == "Lover"].iloc[0]["Lyrics"]

preprocessing 744/745

Out[17]:

	term	tfidf
18	forev	12.851665
21	lover	12.851665
17	close	7.010154
19	ah	6.808736
50	overdramat	6.613384
...	...	...
16	\n\n	0.832623
31	love	0.673213
13	know	0.482158
60	urlcopyembedcopi	0.349986
5	\n	0.037609

61 rows × 2 columns

In [25]:

Copied!





def get_lyrics(songs_df, artist, title):
    """Given the songs.csv dataframe, pulls out the lyrics for a particular artist and song.
    """
    return songs_df[songs_df["Artist"] == artist][df["Title"] == title].iloc[0]["Lyrics"]

get_lyrics(df, "Taylor Swift", "Lover")
def get_lyrics(songs_df, artist, title):
    """Given the songs.csv dataframe, pulls out the lyrics for a particular artist and song.
    """
    return songs_df[songs_df["Artist"] == artist][df["Title"] == title].iloc[0]["Lyrics"]

get_lyrics(df, "Taylor Swift", "Lover")

<ipython-input-25-10ffb6885a7e>:4: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  return songs_df[songs_df["Artist"] == artist][df["Title"] == title].iloc[0]["Lyrics"]

Out[25]:

"We could leave the Christmas lights up 'til January\nAnd this is our place, we make the rules\nAnd there's a dazzling haze, a mysterious way about you, dear\nHave I known you twenty seconds or twenty years?\n\nCan I go where you go?\nCan we always be this close?\nForever and ever, ah\nTake me out, and take me home\nYou're my, my, my, my lover\n\nWe could let our friends crash in the living room\nThis is our place, we make the call\nAnd I'm highly suspicious that everyone who sees you wants you\nI've loved you three summers now, honey, but I want 'em all\n\nCan I go where you go?\nCan we always be this close?\nForever and ever, ah\nTake me out, and take me home (Forever and ever)\nYou're my, my, my, my lover\nLadies and gentlemen, will you please stand?\nWith every guitar string scar on my hand\nI take this magnetic force of a man to be my lover\nMy heart's been borrowed and yours has been blue\nAll's well that ends well to end up with you\nSwear to be overdramatic and true to my lover\nAnd you'll save all your dirtiest jokes for me\nAnd at every table, I'll save you a seat, lover\n\nCan I go where you go?\nCan we always be this close?\nForever and ever, ah\nTake me out, and take me home (Forever and ever)\nYou're my, my, my, my\nOh, you're my, my, my, my\nDarling, you're my, my, my, my lover108EmbedShare URLCopyEmbedCopy"

In [30]:

Copied!

df[df["Artist"] == "The Beatles"]["Title"].unique()
df[df["Artist"] == "The Beatles"]["Title"].unique()

Out[30]:

array(['Let It Be', 'Yesterday', 'Come Together', 'Hey Jude',
       'Here Comes the Sun', 'Something', 'A Day in the Life',
       'Blackbird', 'I Am the Walrus', 'Eleanor Rigby', 'In My Life',
       'While My Guitar Gently Weeps', 'Lucy in the Sky with Diamonds',
       'Across the Universe', 'Strawberry Fields Forever',
       'With a Little Help from My Friends', 'Michelle', 'Help!',
       'I Want to Hold Your Hand', 'Ob-La-Di, Ob-La-Da',
       'Yellow Submarine', 'Norwegian Wood (This Bird Has Flown)',
       'Don’t Let Me Down', 'When I’m Sixty-Four', 'Rocky Raccoon',
       'Oh! Darling', 'Back in the U.S.S.R.', 'Happiness is a Warm Gun',
       'Golden Slumbers', 'Penny Lane', 'Helter Skelter',
       'All You Need Is Love', 'Maxwell’s Silver Hammer', 'She Loves You',
       'Revolution'], dtype=object)

In [ ]: