This notebook is an exercise in text preprocessing using Apache Spark's RDD API and functional programming. The goal is simple: ingest the text of Shakespeare's The Tragedy of Hamlet, Prince of Denmark and transform it into a dataset that could feed a bag-of-words model.
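For context, a bag-of-words representation simply counts how often each vocabulary term appears in each document. A minimal sketch with scikit-learn's CountVectorizer, using made-up toy sentences rather than the Hamlet data prepared below:
from sklearn.feature_extraction.text import CountVectorizer
toy_docs = ['to be or not to be', 'what a piece of work is a man']   # illustration only
toy_vec = CountVectorizer()
toy_counts = toy_vec.fit_transform(toy_docs)      # sparse document-term count matrix
toy_vec.get_feature_names_out()                   # learned vocabulary, one column per term
toy_counts.toarray()                              # raw counts per toy document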
import findspark
findspark.init()
import pyspark
sc = pyspark.SparkContext()
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
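# Note: word_tokenize, pos_tag, WordNetLemmatizer, and stopwords rely on NLTK data packages
# ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'stopwords'); if any are missing,
# download each once with nltk.download('<package>').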
from sklearn.feature_extraction.text import TfidfVectorizer
import functools
from functools import partial
raw_hamlet = sc.textFile('Desktop/hamlet.txt')
raw_hamlet.take(25)
def get_line_id(line):
    """Strip the 'hamlet@' prefix from the first field and cast it to an int."""
    line_id = line[0].replace('hamlet@', '')
    line[0] = int(line_id)
    return line
def no_pipes(line):
    """Drop fields that are just a pipe and strip stray '|' characters from the rest."""
    line_out = [line[0]]
    for l in line[1:]:
        if l == '|':
            continue
        line_out.append(l.replace('|', ''))
    return line_out
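Before wiring these helpers into the RDD pipeline, here is roughly what they do to one record. The field values below are made up for illustration; the real file uses tab-separated fields with a 'hamlet@<offset>' id in front:
sample = 'hamlet@31782\tHORATIO\t|A piece of him.'   # hypothetical record
fields = sample.split('\t')       # ['hamlet@31782', 'HORATIO', '|A piece of him.']
fields = get_line_id(fields)      # [31782, 'HORATIO', '|A piece of him.']
fields = no_pipes(fields)         # [31782, 'HORATIO', 'A piece of him.']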
hamlet_data = (raw_hamlet.map(lambda x: x.split('\t'))            # split on the tab delimiter
               .map(lambda line: get_line_id(line))               # extract line ids, convert to int
               .map(lambda x: no_pipes(x))                        # get rid of pipe characters
               .map(lambda x: [l for l in x if l != ''])          # drop empty fields
               .filter(lambda x: len(x) > 1)                      # drop lines with no text left
               .filter(lambda x: x[1][0] != '[')                  # get rid of stage directions
               .filter(lambda x: x[0] >= 1014)                    # skip the introduction text
               )
hamlet_ls = hamlet_data.collect()
hamlet_ls[:20]
act_lines = [x for x in hamlet_ls if re.search(r'ACT [I]*V?[I]*', x[1])]
act_lines
acts_dict = {}
for act in act_lines:
    if act[1] in acts_dict:
        acts_dict[act[1]].append(act[0])
    else:
        acts_dict[act[1]] = [act[0]]
acts_dict
for k in acts_dict.keys():
    acts_dict[k] = min(acts_dict[k])
acts_dict
def add_act(line):
    """Tag a line with the act it belongs to, based on each act's starting line id."""
    act = ''
    # acts_dict preserves insertion order, so the last act whose start id
    # is <= this line's id is the act the line falls in
    for k in acts_dict.keys():
        if line[0] >= acts_dict[k]:
            act = k
    out_line = [line[0], act] + line[1:]
    return out_line
hamlet_data = (hamlet_data.map(lambda x: add_act(x))                       # add the act to each line
               .filter(lambda x: not re.search(r'ACT [I]*V?[I]*', x[2]))   # drop the act heading lines themselves
               )
hamlet_data.take(10)
no_act_lines_ls = hamlet_data.collect()
scene_lines = [x for x in no_act_lines_ls if re.search(r'SCENE [I]*V?[I]*', x[2])]
scene_lines
scenes_dict = {}
for s in scene_lines:
    if s[1] in scenes_dict:
        scenes_dict[s[1]][s[2]] = s[0]
    else:
        scenes_dict[s[1]] = {s[2]: s[0]}
scenes_dict
def add_scene(line):
    """Tag a line with its scene, using the scene start positions in scenes_dict."""
    scene = ''
    # scene start ids increase through the play, so the last scene whose
    # start id is <= this line's id is the scene the line falls in
    for a in scenes_dict.keys():
        for s in scenes_dict[a].keys():
            if line[0] >= scenes_dict[a][s]:
                scene = s
    out_line = line[0:2] + [scene] + line[2:]
    return out_line
hamlet_data = (hamlet_data.map(lambda x: add_scene(x))                       # add the scene to each line
               .filter(lambda x: not re.search(r'SCENE [I]*V?[I]*', x[3]))   # drop the scene heading lines themselves
               )
hamlet_lol = hamlet_data.collect()
hamlet_lol = [x[:3] + [''] + x[3:] if len(x) == 4 else x for x in hamlet_lol]  # lines with no speaker get an empty speaker field
wrong_len = [x for x in hamlet_lol if len(x) != 5]
wrong_len
hamlet_lol = [x[:4] + [x[5]] if x[4] == '[Within]' else x for x in hamlet_lol]  # drop the '[Within]' stage cue, keeping speaker and text
cols = ['line_no', 'act', 'scene', 'speaker', 'text']
df = pd.DataFrame(hamlet_lol, columns=cols)
df.head(20)
def fill_with_previous(ls):
    """Replace empty strings with the most recent non-empty value (forward fill)."""
    out = []
    for i in range(len(ls)):
        if ls[i] == '':
            out.append(out[i - 1])
        else:
            out.append(ls[i])
    return out
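A quick sanity check of the forward-fill helper on a toy list (the first entry must be non-empty, as it is for the speaker column here):
fill_with_previous(['HAMLET', '', '', 'HORATIO', ''])
# ['HAMLET', 'HAMLET', 'HAMLET', 'HORATIO', 'HORATIO']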
names_ls = list(df['speaker'])
names_ls[:10]
names_ls = fill_with_previous(names_ls)
df['speaker'] = names_ls
df.head(20)
df['speaker'].unique()
prologue = df.loc[df['speaker']=='Prologue', :]
prologue
df = df.loc[df['speaker']!='Prologue', :]
df.info()
df.text[:10]
df['q_marks'] = df['text'].str.count(r'\?')
df['ex_marks'] = df['text'].str.count(r'\!')
df['words_only'] = df['text'].str.replace(r'\W+', ' ', regex=True)
df.words_only
def compose(*functions):
    """Compose single-argument functions, applying them left to right."""
    return functools.reduce(lambda f, g: lambda x: g(f(x)), functions, lambda x: x)
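Note the composition order: compose applies its arguments left to right, so compose(f, g)(x) is g(f(x)). A quick check with built-ins:
compose(str.strip, str.upper)('  to be or not to be  ')
# 'TO BE OR NOT TO BE'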
def get_tokens(line):
    return nltk.word_tokenize(line)
def get_wordnet_pos(word):
    """Map the first letter of the NLTK POS tag to the corresponding WordNet tag."""
    pos_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    tag = nltk.pos_tag([word])[0][1][0]
    return pos_dict.get(tag, wordnet.NOUN)
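The lookup only uses the first character of the Penn Treebank tag returned by pos_tag, and anything outside J/N/V/R falls back to NOUN; tagging a single word out of context is approximate, so the lemmatizer occasionally gets the wrong part of speech. For example:
# Penn tag             first letter   WordNet POS passed to the lemmatizer
# 'VBD' (verb)         'V'            wordnet.VERB
# 'NNS' (noun)         'N'            wordnet.NOUN
# 'IN'  (preposition)  'I'            wordnet.NOUN (fallback)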
def lemma_gen(lemmatizer, token_ls):
    """Yield the lemma of each token, using its inferred WordNet POS tag."""
    for word in token_ls:
        yield lemmatizer().lemmatize(word, get_wordnet_pos(word))

# bind the lemmatizer class so the pipeline only needs to pass the token list
lemma_gen = partial(lemma_gen, WordNetLemmatizer)
def extract_lem(lem_gen):
    """Materialize the lemma generator, dropping stray 's' tokens left over from possessives."""
    return [l for l in lem_gen if l != 's']
def no_stopwords(ls):
    """Lowercase the tokens, drop English stopwords, and join them back into one string."""
    return ' '.join([w.lower() for w in ls if w.lower() not in stopwords.words('english')])
pipeline = compose(get_tokens, lemma_gen, extract_lem, no_stopwords)
lems = pipeline('I have seen you at the store.')
lems
df['lemmatized'] = df['words_only'].apply(pipeline)
df.head(20)
df = df.loc[df['lemmatized'].str.len() > 0]
df.head()
vectorizer = TfidfVectorizer()
vectorized = vectorizer.fit_transform(df['lemmatized'])
X_train = pd.DataFrame.sparse.from_spmatrix(vectorized)
col_map = {v: k for k, v in vectorizer.vocabulary_.items()}   # column index -> vocabulary term
for col in X_train.columns:
    X_train.rename(columns={col: col_map[col]}, inplace=True)
X_train.head()
len(X_train)
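As an aside, on scikit-learn 1.0+ the rename loop above can be replaced by a single assignment, since get_feature_names_out() returns the vocabulary terms in matrix column order (an equivalent alternative, not what was run above):
X_train.columns = vectorizer.get_feature_names_out()   # same mapping as the rename loop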