CoCalc -- 2021-04-30-word-frequency-over-time.ipynb

Project: Jeremee Nute - REL 370: Digital Texts in the Humanities

Path: Final Project / 2021-04-30-word-frequency-over-time.ipynb

Views: ¹⁷⁸⁷⁵
Image: ubuntu2004

Kernel: Python 3 (Anaconda 2020)

In [1]:

from bs4 import BeautifulSoup
import glob
import re
import json
import csv
import os

In [2]:

# had to delete one file from both Metadata and word-count file due to the ngram1 file not containing any information. journal-article-10.2307_1196540-ngram1.txt
total_freq = {}

# The json schema here is {pub-year: {word: count}}

for xml in glob.iglob('Metadata/*.xml'):
    with open(xml) as f:
        bs = BeautifulSoup(f, "lxml-xml")
    
    # Extract pub lish year
    pub_year = bs.year
    year = int(str(pub_year)[6:10])
    
    total_freq[year] = total_freq.get(year, {})

    
    txt = xml.replace("Metadata", "word-count").replace(".xml", "-ngram1.txt")
    
    
    
    with open(txt) as t:
        for line in t:
            sub = re.split("\s+", line)
            word = sub[0]
            count = int(sub[1])
            
            if total_freq[year].get(word, 0) == 0:
                total_freq[year][word] = 0

            total_freq[year][word] += count

    
file = open("count.json", "w")
    
with file:
    json.dump(total_freq, file)

In [0]:

In [3]:

import pandas as pd

In [4]:

df = pd.DataFrame.from_dict(total_freq, orient = 'index')

In [5]:

long_df = pd.melt(df, ignore_index = False)

/ext/anaconda2020.02/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3343: FutureWarning: This dataframe has a column name that matches the 'value_name' column name of the resultiing Dataframe. In the future this will raise an error, please set the 'value_name' parameter of DataFrame.melt to a unique name.
  exec(code_obj, self.user_global_ns, self.user_ns)

In [6]:

long_df.reset_index(inplace=True)

In [7]:

long_df

	index	variable	value
0	1998	s	278.0
1	1952	s	33.0
2	1987	s	278.0
3	1930	s	105.0
4	1925	s	245.0
...	...	...	...
5957674	2005	varg	NaN
5957675	1958	varg	NaN
5957676	1962	varg	NaN
5957677	1971	varg	NaN
5957678	1959	varg	1.0

5957679 rows × 3 columns

In [0]:

In [8]:

#I tried creatinga loop for removing stop words from the data frame but it pulled out the variable column, so I guess I'm doing this probably the worst way

long_df.drop(long_df.index[long_df['variable'] == 's'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '00'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '95'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '0'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '1'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'i'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'pp'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'has'], inplace = True)

In [9]:

long_df

	index	variable	value
273	1998	book	101.0
274	1952	book	11.0
275	1987	book	90.0
276	1930	book	70.0
277	1925	book	155.0
...	...	...	...
5957674	2005	varg	NaN
5957675	1958	varg	NaN
5957676	1962	varg	NaN
5957677	1971	varg	NaN
5957678	1959	varg	1.0

5956951 rows × 3 columns

In [0]:

In [10]:

long_df.drop(long_df.index[long_df['variable'] == 'new'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'which'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'from'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'his'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'its'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'have'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'all'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'he'], inplace = True)

In [11]:

long_df.drop(long_df.index[long_df['variable'] == 'one'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'been'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'pages'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'we'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'would'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'had'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'j'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'our'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'cloth'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'hardback'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'you'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'p'], inplace = True)

In [12]:

long_df.drop(long_df.index[long_df['variable'] == 'new'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'york'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'press'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'were'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'any'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'isbn'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '35'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '800'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '86554'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'ii'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'e'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'd'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'her'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'b'], inplace = True)

In [13]:

long_df.drop(long_df.index[long_df['variable'] == 'who'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'also'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'what'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'more'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'may'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'should'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'those'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'ibid'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '86554'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'h'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '6'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '50'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '3'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'car'], inplace = True)

In [14]:

long_df.drop(long_df.index[long_df['variable'] == 'same'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'l'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'see'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'my'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'so'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'between'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'other'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'us'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'she'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'many'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'work'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'can'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'your'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '2'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '4'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '75'], inplace = True)

In [15]:

long_df.drop(long_df.index[long_df['variable'] == 'only'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'them'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'than'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'factor'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '5'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'co'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'two'], inplace = True)

In [16]:

long_df.drop(long_df.index[long_df['variable'] == 'most'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'must'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'upon'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'me'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'do'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'cannot'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'when'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'some'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'through'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'people'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'much'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'w'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'm'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'person'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'does'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'him'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'st'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'm'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'yet'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'c'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'x'], inplace = True)

In [17]:

long_df.drop(long_df.index[long_df['variable'] == 'could'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'd'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'when'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'how'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'whether'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'yet'], inplace = True)

In [18]:

long_df.drop(long_df.index[long_df['variable'] == 'out'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'even'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'type'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'action'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'well'], inplace = True)

In [19]:

long_df.drop(long_df.index[long_df['variable'] == 'very'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'mr'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'dr'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'made'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'own'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'far'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'himself'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '25'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == '7'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'now'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'each'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'like'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'here'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'about'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'within'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'n'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 't'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'both'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'r'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'n'], inplace = True)

In [20]:

long_df.drop(long_df.index[long_df['variable'] == 'thus'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'itself'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'still'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'make'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'sense'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'focus'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'never'], inplace = True)
long_df.drop(long_df.index[long_df['variable'] == 'while'], inplace = True)

In [21]:

top_fifteen_words = long_df.groupby('index').apply(lambda x : x.sort_values(by = 'value', ascending = False).head(15).reset_index(drop = True))

In [23]:

pd.set_option('display.max_row', 2000)

In [0]:

In [1]:

top_ten_words = long_df.groupby('index').apply(lambda x : x.sort_values(by = 'value', ascending = False).head(10).reset_index(drop = True))

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-97077e0e5b89> in <module>
----> 1 top_ten_words = long_df.groupby('index').apply(lambda x : x.sort_values(by = 'value', ascending = False).head(10).reset_index(drop = True))

NameError: name 'long_df' is not defined

In [0]:

In [0]:

#You can now see the top_ten_words or the top_fifteen_words used over time. All you need to do is type top_ten_words or top_fifteen_words and then hit command return. 

#top_ten_words
#top_fifteen_words

In [0]:

In [0]:

In [0]: