{ "cells": [
 { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ ], "source": [
  "import csv\n",
  "import glob\n",
  "import json\n",
  "import os\n",
  "import re\n",
  "\n",
  "import pandas as pd\n",
  "from bs4 import BeautifulSoup"
 ] },
 { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ ], "source": [
  "# Aggregate unigram counts per publication year.\n",
  "# Schema of total_freq (and of count.json): {pub-year: {word: count}}\n",
  "#\n",
  "# NOTE: journal-article-10.2307_1196540 had to be deleted from both Metadata/ and\n",
  "# word-count/ because its -ngram1.txt file did not contain any information.\n",
  "METADATA_DIR = \"Metadata\"\n",
  "WORD_COUNT_DIR = \"word-count\"\n",
  "\n",
  "total_freq = {}\n",
  "\n",
  "for xml_path in glob.iglob(os.path.join(METADATA_DIR, \"*.xml\")):\n",
  "    # Binary mode lets the XML parser honour the file's own encoding declaration.\n",
  "    with open(xml_path, \"rb\") as xml_file:\n",
  "        soup = BeautifulSoup(xml_file, \"lxml-xml\")\n",
  "\n",
  "    # Take the publication year from the text of the first <year> element\n",
  "    # rather than slicing fixed character offsets out of the tag's string form.\n",
  "    year_tag = soup.find(\"year\")\n",
  "    year_match = re.search(r\"\\d{4}\", year_tag.get_text()) if year_tag is not None else None\n",
  "    if year_match is None:\n",
  "        raise ValueError(f\"no 4-digit publication year found in {xml_path}\")\n",
  "    year = int(year_match.group())\n",
  "    year_counts = total_freq.setdefault(year, {})\n",
  "\n",
  "    # Metadata/<id>.xml -> word-count/<id>-ngram1.txt\n",
  "    article_id = os.path.splitext(os.path.basename(xml_path))[0]\n",
  "    ngram_path = os.path.join(WORD_COUNT_DIR, article_id + \"-ngram1.txt\")\n",
  "\n",
  "    with open(ngram_path) as ngram_file:\n",
  "        for line in ngram_file:\n",
  "            fields = line.split()  # whitespace-separated: <word> <count>\n",
  "            if not fields:\n",
  "                continue  # tolerate blank lines\n",
  "            word, count = fields[0], int(fields[1])\n",
  "            year_counts[word] = year_counts.get(word, 0) + count\n",
  "\n",
  "# json.dump writes the integer year keys as strings (JSON object keys are always strings).\n",
  "with open(\"count.json\", \"w\") as out_file:\n",
  "    json.dump(total_freq, out_file)"
 ] },
 { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ ], "source": [
  "# Wide table: one row per publication year, one column per word\n",
  "# (NaN where a word has no count for that year).\n",
  "df = pd.DataFrame.from_dict(total_freq, orient=\"index\")"
 ] },
 { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ ], "source": [
  "# Reshape the wide year x word table into long form: one row per (year, word).\n",
  "#\n",
  "# The table has a column for the word \"value\", which collides with melt's default\n",
  "# value_name; pandas emits a FutureWarning for that and announces it will become an\n",
  "# error. Column names here come from whitespace-split tokens, so a temporary name\n",
  "# containing a space cannot collide. It is renamed back afterwards so the result\n",
  "# keeps the columns index / variable / value.\n",
  "#\n",
  "# reset_index() is part of the chain (not a separate in-place cell) so that\n",
  "# re-running this cell always yields the same frame.\n",
  "MELT_VALUE_NAME = \"word count\"\n",
  "long_df = (\n",
  "    df.melt(ignore_index=False, value_name=MELT_VALUE_NAME)\n",
  "    .rename(columns={MELT_VALUE_NAME: \"value\"})\n",
  "    .reset_index()\n",
  ")"
 ] },
 { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " | index | \n", "variable | \n", "value | \n", "
---|---|---|---|
0 | \n", "1998 | \n", "s | \n", "278.0 | \n", "
1 | \n", "1952 | \n", "s | \n", "33.0 | \n", "
2 | \n", "1987 | \n", "s | \n", "278.0 | \n", "
3 | \n", "1930 | \n", "s | \n", "105.0 | \n", "
4 | \n", "1925 | \n", "s | \n", "245.0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "
5957674 | \n", "2005 | \n", "varg | \n", "NaN | \n", "
5957675 | \n", "1958 | \n", "varg | \n", "NaN | \n", "
5957676 | \n", "1962 | \n", "varg | \n", "NaN | \n", "
5957677 | \n", "1971 | \n", "varg | \n", "NaN | \n", "
5957678 | \n", "1959 | \n", "varg | \n", "1.0 | \n", "
5957679 rows × 3 columns
\n", "\n", " | index | \n", "variable | \n", "value | \n", "
---|---|---|---|
273 | \n", "1998 | \n", "book | \n", "101.0 | \n", "
274 | \n", "1952 | \n", "book | \n", "11.0 | \n", "
275 | \n", "1987 | \n", "book | \n", "90.0 | \n", "
276 | \n", "1930 | \n", "book | \n", "70.0 | \n", "
277 | \n", "1925 | \n", "book | \n", "155.0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "
5957674 | \n", "2005 | \n", "varg | \n", "NaN | \n", "
5957675 | \n", "1958 | \n", "varg | \n", "NaN | \n", "
5957676 | \n", "1962 | \n", "varg | \n", "NaN | \n", "
5957677 | \n", "1971 | \n", "varg | \n", "NaN | \n", "
5957678 | \n", "1959 | \n", "varg | \n", "1.0 | \n", "
5956951 rows × 3 columns
\n", "