Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News AboutSign UpSign In
| Download
Views: 17875
Image: ubuntu2004
Kernel: Python 3 (Anaconda 2020)
from bs4 import BeautifulSoup import glob import re import json import csv import os
# NOTE: one article was removed from both Metadata/ and word-count/ because
# its ngram1 file (journal-article-10.2307_1196540-ngram1.txt) did not
# contain any information.

# Aggregated unigram counts. The JSON schema is {pub-year: {word: count}}.
total_freq = {}

for xml in glob.iglob('Metadata/*.xml'):
    with open(xml) as f:
        bs = BeautifulSoup(f, "lxml-xml")

    # Extract the publish year from the text of the first <year> element.
    # Reading the element text (instead of slicing the serialized tag)
    # keeps working if the tag carries attributes or surrounding whitespace.
    year = int(bs.year.get_text(strip=True))
    year_counts = total_freq.setdefault(year, {})

    # Metadata/<name>.xml  ->  word-count/<name>-ngram1.txt
    stem = os.path.splitext(os.path.basename(xml))[0]
    txt = os.path.join("word-count", stem + "-ngram1.txt")

    with open(txt) as t:
        for line in t:
            # Each line is "<word> <count>"; split() tolerates any run of
            # whitespace and ignores the trailing newline.
            fields = line.split()
            if len(fields) < 2:
                # Blank or truncated line: nothing to count.
                continue
            word, count = fields[0], int(fields[1])
            year_counts[word] = year_counts.get(word, 0) + count

# Persist the aggregated counts (json.dump turns the int year keys into strings).
with open("count.json", "w") as out:
    json.dump(total_freq, out)
import pandas as pd
# Wide table: one row per publication year (the outer keys of total_freq),
# one column per word; words absent from a year are NaN.
df = pd.DataFrame.from_dict(total_freq, orient='index')

# Reshape to long form with the columns 'variable' (the word) and 'value'
# (its count), keeping the year as the index.
# The word "value" itself occurs as a column of df, which collides with
# melt()'s default value_name: pandas emits a FutureWarning for this and newer
# releases raise an error. Melt under a placeholder name that cannot be a
# word column, then rename it back so the resulting frame still exposes 'value'.
_melt_value_name = "__word_count__"
long_df = pd.melt(
    df, ignore_index=False, value_name=_melt_value_name
).rename(columns={_melt_value_name: "value"})
/ext/anaconda2020.02/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3343: FutureWarning: This dataframe has a column name that matches the 'value_name' column name of the resultiing Dataframe. In the future this will raise an error, please set the 'value_name' parameter of DataFrame.melt to a unique name. exec(code_obj, self.user_global_ns, self.user_ns)
# Move the current index into an ordinary column (named 'index' for an
# unnamed index) and give the frame a fresh 0..n-1 row index, modifying
# long_df in place.
long_df.reset_index(drop=False, inplace=True)

# Display the reshaped frame.
long_df
index variable value
0 1998 s 278.0
1 1952 s 33.0
2 1987 s 278.0
3 1930 s 105.0
4 1925 s 245.0
... ... ... ...
5957674 2005 varg NaN
5957675 1958 varg NaN
5957676 1962 varg NaN
5957677 1971 varg NaN
5957678 1959 varg 1.0

5957679 rows × 3 columns

# Remove the first batch of stop words / stray tokens from the long frame.
# An earlier attempt to do this with a loop removed the 'variable' column
# instead of the matching rows, so the rows are selected explicitly: build one
# boolean mask over 'variable' and drop every matching row in a single pass
# (instead of one full scan of the frame per word).
initial_stop_words = ['s', '00', '95', '0', '1', 'i', 'pp', 'has']
long_df.drop(
    long_df.index[long_df['variable'].isin(initial_stop_words)],
    inplace=True,
)

# Display the filtered frame.
long_df
index variable value
273 1998 book 101.0
274 1952 book 11.0
275 1987 book 90.0
276 1930 book 70.0
277 1925 book 155.0
... ... ... ...
5957674 2005 varg NaN
5957675 1958 varg NaN
5957676 1962 varg NaN
5957677 1971 varg NaN
5957678 1959 varg 1.0

5956951 rows × 3 columns

# Remaining stop words, publisher boilerplate and stray tokens to remove.
# Kept as a set so that words listed more than once collapse to one entry.
stop_words = {
    'new', 'which', 'from', 'his', 'its', 'have', 'all', 'he',
    'one', 'been', 'pages', 'we', 'would', 'had', 'j', 'our', 'cloth',
    'hardback', 'you', 'p',
    'york', 'press', 'were', 'any', 'isbn', '35', '800', '86554', 'ii',
    'e', 'd', 'her', 'b',
    'who', 'also', 'what', 'more', 'may', 'should', 'those', 'ibid', 'h',
    '6', '50', '3', 'car',
    'same', 'l', 'see', 'my', 'so', 'between', 'other', 'us', 'she', 'many',
    'work', 'can', 'your', '2', '4', '75',
    'only', 'them', 'than', 'factor', '5', 'co', 'two',
    'most', 'must', 'upon', 'me', 'do', 'cannot', 'when', 'some', 'through',
    'people', 'much', 'w', 'm', 'person', 'does', 'him', 'st', 'yet', 'c',
    'x',
    'could', 'how', 'whether',
    'out', 'even', 'type', 'action', 'well',
    'very', 'mr', 'dr', 'made', 'own', 'far', 'himself', '25', '7', 'now',
    'each', 'like', 'here', 'about', 'within', 'n', 't', 'both', 'r',
    'thus', 'itself', 'still', 'make', 'sense', 'focus', 'never', 'while',
}

# Drop every row whose word is in the set with a single membership mask,
# rather than re-scanning the whole frame once per word. The frame is still
# modified in place, exactly as the individual drop() calls did.
long_df.drop(
    long_df.index[long_df['variable'].isin(stop_words)],
    inplace=True,
)
def _top_words_per_year(frame, n):
    """Return the n highest-count rows of frame for each year.

    frame is grouped by its 'index' column (the publication year); within
    each group the rows are sorted by 'value' in descending order, the first
    n are kept, and the per-group row index is renumbered from 0.
    """
    return frame.groupby('index').apply(
        lambda group: group.sort_values(by='value', ascending=False)
        .head(n)
        .reset_index(drop=True)
    )


top_fifteen_words = _top_words_per_year(long_df, 15)

# Let the notebook print up to 2000 rows. The full option name is used;
# abbreviated keys such as 'display.max_row' only work through pandas'
# partial-name matching.
pd.set_option('display.max_rows', 2000)

top_ten_words = _top_words_per_year(long_df, 10)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-97077e0e5b89> in <module> ----> 1 top_ten_words = long_df.groupby('index').apply(lambda x : x.sort_values(by = 'value', ascending = False).head(10).reset_index(drop = True)) NameError: name 'long_df' is not defined
# You can now see the top ten or the top fifteen words used in each year.
# To display either table, type top_ten_words or top_fifteen_words in a cell
# and then press Command+Return.
# top_ten_words
# top_fifteen_words