{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "import glob\n",
    "import re\n",
    "import json\n",
    "import csv\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
    "# Aggregate unigram counts per publication year.\n",
    "# Every Metadata/<name>.xml is paired with word-count/<name>-ngram1.txt.\n",
    "# Note: journal-article-10.2307_1196540 was deleted from both Metadata and word-count\n",
    "# because its ngram1 file (journal-article-10.2307_1196540-ngram1.txt) contained no information.\n",
    "\n",
    "# Schema: {pub_year: {word: count}}\n",
    "total_freq = {}\n",
    "\n",
    "for xml in glob.iglob('Metadata/*.xml'):\n",
    "    with open(xml) as f:\n",
    "        bs = BeautifulSoup(f, \"lxml-xml\")\n",
    "\n",
    "    # Extract the publication year from the text of the first <year> element instead of\n",
    "    # slicing the tag's string form, which breaks on attributes or surrounding whitespace.\n",
    "    pub_year = bs.year\n",
    "    year_match = re.search(r\"\\d{4}\", pub_year.get_text()) if pub_year is not None else None\n",
    "    if year_match is None:\n",
    "        raise ValueError(f\"no four-digit <year> found in {xml}\")\n",
    "    year = int(year_match.group())\n",
    "\n",
    "    # Counts accumulated so far for this year (created on first sight of the year).\n",
    "    year_counts = total_freq.setdefault(year, {})\n",
    "\n",
    "    txt = xml.replace(\"Metadata\", \"word-count\").replace(\".xml\", \"-ngram1.txt\")\n",
    "\n",
    "    with open(txt) as t:\n",
    "        for line in t:\n",
    "            # The first two whitespace-separated fields are the word and its count.\n",
    "            # str.split() also discards the trailing newline; blank or incomplete\n",
    "            # lines are skipped instead of crashing the whole run.\n",
    "            sub = line.split()\n",
    "            if len(sub) < 2:\n",
    "                continue\n",
    "            word = sub[0]\n",
    "            count = int(sub[1])\n",
    "            year_counts[word] = year_counts.get(word, 0) + count\n",
    "\n",
    "# Persist the totals; json.dump writes the integer year keys as strings.\n",
    "with open(\"count.json\", \"w\") as file:\n",
    "    json.dump(total_freq, file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
    "# Wide table of counts: one row per publication year, one column per word\n",
    "# (NaN where a word has no count for that year).\n",
    "df = pd.DataFrame.from_dict(total_freq, orient=\"index\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/ext/anaconda2020.02/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3343: FutureWarning: This dataframe has a column name that matches the 'value_name' column name of the resultiing Dataframe. In the future this will raise an error, please set the 'value_name' parameter of DataFrame.melt to a unique name.\n",
      "  exec(code_obj, self.user_global_ns, self.user_ns)\n"
     ]
    }
   ],
   "source": [
    "# Reshape the wide (year x word) table into long form: one row per (year, word).\n",
    "# One of the word columns is itself called 'value', which clashes with melt's default\n",
    "# value_name (this produced a FutureWarning; newer pandas versions raise instead).\n",
    "# Melt under a temporary name containing a space -- the words were split on whitespace,\n",
    "# so no column can have that name -- then rename it back to 'value' for the later cells.\n",
    "long_df = (\n",
    "    pd.melt(df, ignore_index=False, value_name='word count')\n",
    "    .rename(columns={'word count': 'value'})\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
    "# Move the publication year out of the index into a regular 'index' column.\n",
    "# Guarded so that re-running this cell does not add a spurious 'level_0' column.\n",
    "if 'index' not in long_df.columns:\n",
    "    long_df = long_df.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>variable</th>\n",
       "      <th>value</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1998</td>\n",
       "      <td>s</td>\n",
       "      <td>278.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1952</td>\n",
       "      <td>s</td>\n",
       "      <td>33.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1987</td>\n",
       "      <td>s</td>\n",
       "      <td>278.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1930</td>\n",
       "      <td>s</td>\n",
       "      <td>105.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1925</td>\n",
       "      <td>s</td>\n",
       "      <td>245.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5957674</th>\n",
       "      <td>2005</td>\n",
       "      <td>varg</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5957675</th>\n",
       "      <td>1958</td>\n",
       "      <td>varg</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5957676</th>\n",
       "      <td>1962</td>\n",
       "      <td>varg</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5957677</th>\n",
       "      <td>1971</td>\n",
       "      <td>varg</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5957678</th>\n",
       "      <td>1959</td>\n",
       "      <td>varg</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5957679 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         index variable  value\n",
       "0         1998        s  278.0\n",
       "1         1952        s   33.0\n",
       "2         1987        s  278.0\n",
       "3         1930        s  105.0\n",
       "4         1925        s  245.0\n",
       "...        ...      ...    ...\n",
       "5957674   2005     varg    NaN\n",
       "5957675   1958     varg    NaN\n",
       "5957676   1962     varg    NaN\n",
       "5957677   1971     varg    NaN\n",
       "5957678   1959     varg    1.0\n",
       "\n",
       "[5957679 rows x 3 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {
     },
     "output_type": "execute_result"
    }
   ],
   "source": [
    "long_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
    "# Remove stop words and stray numeric tokens.\n",
    "# A single isin() mask drops every listed word in one pass over the frame,\n",
    "# replacing one full drop() call per word.\n",
    "long_df.drop(\n",
    "    long_df.index[long_df['variable'].isin([\n",
    "        's', '00', '95', '0', '1', 'i', 'pp', 'has',\n",
    "    ])],\n",
    "    inplace=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>variable</th>\n",
       "      <th>value</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>273</th>\n",
       "      <td>1998</td>\n",
       "      <td>book</td>\n",
       "      <td>101.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>274</th>\n",
       "      <td>1952</td>\n",
       "      <td>book</td>\n",
       "      <td>11.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>275</th>\n",
       "      <td>1987</td>\n",
       "      <td>book</td>\n",
       "      <td>90.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>276</th>\n",
       "      <td>1930</td>\n",
       "      <td>book</td>\n",
       "      <td>70.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>277</th>\n",
       "      <td>1925</td>\n",
       "      <td>book</td>\n",
       "      <td>155.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5957674</th>\n",
       "      <td>2005</td>\n",
       "      <td>varg</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5957675</th>\n",
       "      <td>1958</td>\n",
       "      <td>varg</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5957676</th>\n",
       "      <td>1962</td>\n",
       "      <td>varg</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5957677</th>\n",
       "      <td>1971</td>\n",
       "      <td>varg</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5957678</th>\n",
       "      <td>1959</td>\n",
       "      <td>varg</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5956951 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         index variable  value\n",
       "273       1998     book  101.0\n",
       "274       1952     book   11.0\n",
       "275       1987     book   90.0\n",
       "276       1930     book   70.0\n",
       "277       1925     book  155.0\n",
       "...        ...      ...    ...\n",
       "5957674   2005     varg    NaN\n",
       "5957675   1958     varg    NaN\n",
       "5957676   1962     varg    NaN\n",
       "5957677   1971     varg    NaN\n",
       "5957678   1959     varg    1.0\n",
       "\n",
       "[5956951 rows x 3 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {
     },
     "output_type": "execute_result"
    }
   ],
   "source": [
    "long_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
    "# Remove further stop words in one pass (single isin() mask instead of one drop per word).\n",
    "long_df.drop(\n",
    "    long_df.index[long_df['variable'].isin([\n",
    "        'new', 'which', 'from', 'his', 'its', 'have', 'all', 'he',\n",
    "    ])],\n",
    "    inplace=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
    "# Remove further stop words and bibliographic tokens in one pass\n",
    "# (single isin() mask instead of one drop per word).\n",
    "long_df.drop(\n",
    "    long_df.index[long_df['variable'].isin([\n",
    "        'one', 'been', 'pages', 'we', 'would', 'had',\n",
    "        'j', 'our', 'cloth', 'hardback', 'you', 'p',\n",
    "    ])],\n",
    "    inplace=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
    "# Remove further stop words, publisher/ISBN tokens and stray numbers in one pass\n",
    "# (single isin() mask instead of one drop per word).\n",
    "long_df.drop(\n",
    "    long_df.index[long_df['variable'].isin([\n",
    "        'new', 'york', 'press', 'were', 'any', 'isbn', '35',\n",
    "        '800', '86554', 'ii', 'e', 'd', 'her', 'b',\n",
    "    ])],\n",
    "    inplace=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
    "# Remove further stop words and stray numbers in one pass\n",
    "# (single isin() mask instead of one drop per word).\n",
    "long_df.drop(\n",
    "    long_df.index[long_df['variable'].isin([\n",
    "        'who', 'also', 'what', 'more', 'may', 'should', 'those',\n",
    "        'ibid', '86554', 'h', '6', '50', '3', 'car',\n",
    "    ])],\n",
    "    inplace=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
    "# Remove further stop words and stray numbers in one pass\n",
    "# (single isin() mask instead of one drop per word).\n",
    "long_df.drop(\n",
    "    long_df.index[long_df['variable'].isin([\n",
    "        'same', 'l', 'see', 'my', 'so', 'between', 'other', 'us',\n",
    "        'she', 'many', 'work', 'can', 'your', '2', '4', '75',\n",
    "    ])],\n",
    "    inplace=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
    "# Remove further stop words in one pass (single isin() mask instead of one drop per word).\n",
    "long_df.drop(\n",
    "    long_df.index[long_df['variable'].isin([\n",
    "        'only', 'them', 'than', 'factor', '5', 'co', 'two',\n",
    "    ])],\n",
    "    inplace=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
    "# Remove further stop words and single-letter tokens in one pass\n",
    "# (single isin() mask instead of one drop per word; 'm' was listed twice).\n",
    "long_df.drop(\n",
    "    long_df.index[long_df['variable'].isin([\n",
    "        'most', 'must', 'upon', 'me', 'do', 'cannot', 'when',\n",
    "        'some', 'through', 'people', 'much', 'w', 'm', 'person',\n",
    "        'does', 'him', 'st', 'yet', 'c', 'x',\n",
    "    ])],\n",
    "    inplace=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
    "# Remove further stop words in one pass (single isin() mask instead of one drop per word).\n",
    "long_df.drop(\n",
    "    long_df.index[long_df['variable'].isin([\n",
    "        'could', 'd', 'when', 'how', 'whether', 'yet',\n",
    "    ])],\n",
    "    inplace=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
    "# Remove further stop words in one pass (single isin() mask instead of one drop per word).\n",
    "long_df.drop(\n",
    "    long_df.index[long_df['variable'].isin([\n",
    "        'out', 'even', 'type', 'action', 'well',\n",
    "    ])],\n",
    "    inplace=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
    "# Remove further stop words, titles and single-letter tokens in one pass\n",
    "# (single isin() mask instead of one drop per word; 'n' was listed twice).\n",
    "long_df.drop(\n",
    "    long_df.index[long_df['variable'].isin([\n",
    "        'very', 'mr', 'dr', 'made', 'own', 'far', 'himself',\n",
    "        '25', '7', 'now', 'each', 'like', 'here', 'about',\n",
    "        'within', 'n', 't', 'both', 'r',\n",
    "    ])],\n",
    "    inplace=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
    "# Remove the last batch of stop words in one pass\n",
    "# (single isin() mask instead of one drop per word).\n",
    "long_df.drop(\n",
    "    long_df.index[long_df['variable'].isin([\n",
    "        'thus', 'itself', 'still', 'make', 'sense', 'focus', 'never', 'while',\n",
    "    ])],\n",
    "    inplace=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
    "# Fifteen highest-count remaining words for each publication year.\n",
    "# Within each year the rows are sorted by 'value' (descending), cut to the first 15\n",
    "# and renumbered from 0, so the result is indexed by (year, rank).\n",
    "top_fifteen_words = long_df.groupby('index').apply(\n",
    "    lambda year_rows: year_rows.sort_values(by='value', ascending=False)\n",
    "                               .head(15)\n",
    "                               .reset_index(drop=True)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
    "# Allow up to 2000 rows to be rendered so the per-year top-word tables are not truncated.\n",
    "# Use the full option name: the abbreviated 'display.max_row' only works through pandas'\n",
    "# partial-name matching, which breaks if another option ever matches the same pattern.\n",
    "pd.set_option('display.max_rows', 2000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
    "# Ten highest-count remaining words for each publication year.\n",
    "# Within each year the rows are sorted by 'value' (descending), cut to the first 10\n",
    "# and renumbered from 0, so the result is indexed by (year, rank).\n",
    "# This cell needs long_df from the cells above; run the notebook top to bottom.\n",
    "top_ten_words = long_df.groupby('index').apply(\n",
    "    lambda year_rows: year_rows.sort_values(by='value', ascending=False)\n",
    "                               .head(10)\n",
    "                               .reset_index(drop=True)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
    "# You can now see the top ten or top fifteen words used in each year: type top_ten_words or top_fifteen_words in a cell (or uncomment one of the lines below) and run it with Command+Return.\n",
    "\n",
    "#top_ten_words\n",
    "#top_fifteen_words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
   ],
   "source": [
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (Anaconda 2020)",
   "env": {
    "ADDR2LINE": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-addr2line",
    "ANACONDA2019": "$EXT/anaconda-2019.03",
    "ANACONDA2020": "/ext/anaconda2020.02",
    "ANACONDA3": "$EXT/anaconda3",
    "ANACONDA5": "$EXT/anaconda5",
    "AR": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-ar",
    "AS": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-as",
    "CC": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-cc",
    "CFLAGS": "-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /ext/anaconda2020.02/include",
    "CMAKE_PREFIX_PATH": "/ext/anaconda2020.02:/ext/anaconda2020.02/x86_64-conda_cos6-linux-gnu/sysroot/usr",
    "CONDA_BACKUP_HOST": "x86_64-conda_cos6-linux-gnu",
    "CONDA_BUILD_SYSROOT": "/ext/anaconda2020.02/x86_64-conda_cos6-linux-gnu/sysroot",
    "CONDA_DEFAULT_ENV": "base",
    "CONDA_EXE": "/ext/anaconda2020.02/bin/conda",
    "CONDA_MKL_INTERFACE_LAYER_BACKUP": "",
    "CONDA_PREFIX": "/ext/anaconda2020.02",
    "CONDA_PROMPT_MODIFIER": "(base) ",
    "CONDA_PYTHON_EXE": "/ext/anaconda2020.02/bin/python",
    "CONDA_SHLVL": "1",
    "CPP": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-cpp",
    "CPPFLAGS": "-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /ext/anaconda2020.02/include",
    "CXX": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-c++",
    "CXXFILT": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-c++filt",
    "CXXFLAGS": "-fvisibility-inlines-hidden -std=c++17 -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /ext/anaconda2020.02/include",
    "DEBUG_CFLAGS": "-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /ext/anaconda2020.02/include",
    "DEBUG_CPPFLAGS": "-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /ext/anaconda2020.02/include",
    "DEBUG_CXXFLAGS": "-fvisibility-inlines-hidden -std=c++17 -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /ext/anaconda2020.02/include",
    "DEBUG_FFLAGS": "-fopenmp -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /ext/anaconda2020.02/include -fopenmp -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fcheck=all -fbacktrace -fimplicit-none -fvar-tracking-assignments -ffunction-sections -pipe",
    "DEBUG_FORTRANFLAGS": "-fopenmp -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /ext/anaconda2020.02/include -fopenmp -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fcheck=all -fbacktrace -fimplicit-none -fvar-tracking-assignments -ffunction-sections -pipe",
    "ELFEDIT": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-elfedit",
    "F77": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gfortran",
    "F90": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gfortran",
    "F95": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-f95",
    "FC": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gfortran",
    "FFLAGS": "-fopenmp -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /ext/anaconda2020.02/include",
    "FORTRANFLAGS": "-fopenmp -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /ext/anaconda2020.02/include",
    "GCC": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gcc",
    "GCC_AR": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gcc-ar",
    "GCC_NM": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gcc-nm",
    "GCC_RANLIB": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gcc-ranlib",
    "GDAL_DATA": "/ext/anaconda2020.02/share/gdal",
    "GFORTRAN": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gfortran",
    "GPROF": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gprof",
    "GXX": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-g++",
    "JAVA_HOME": "/ext/anaconda2020.02",
    "JAVA_HOME_CONDA_BACKUP": "",
    "JAVA_LD_LIBRARY_PATH": "/ext/anaconda2020.02/lib/server",
    "LD": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-ld",
    "LDFLAGS": "-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,-rpath,/ext/anaconda2020.02/lib -Wl,-rpath-link,/ext/anaconda2020.02/lib -L/ext/anaconda2020.02/lib",
    "LD_GOLD": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-ld.gold",
    "LD_LIBRARY_PATH": "/ext/anaconda2020.02/lib",
    "MKL_INTERFACE_LAYER": "LP64,GNU",
    "MKL_THREADING_LAYER": "GNU",
    "NM": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-nm",
    "OBJCOPY": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-objcopy",
    "OBJDUMP": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-objdump",
    "OLDPWD": "/ext/anaconda2020.02",
    "PATH": "/ext/anaconda2020.02/bin:/ext/anaconda2020.02/bin:/ext/anaconda2020.02/condabin:/cocalc/bin:/cocalc/src/smc-project/bin:/home/salvus/bin:/home/salvus/.local/bin:/usr/lib/xpra:/ext/bin:/opt/ghc/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin:/usr/lib/postgresql/10/bin:/ext/data/homer/bin:/ext/data/weblogo",
    "PROJ_LIB": "/ext/anaconda2020.02/share/proj",
    "RANLIB": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-ranlib",
    "READELF": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-readelf",
    "RSTUDIO_WHICH_R": "/ext/anaconda2020.02/bin/R",
    "SIZE": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-size",
    "STRINGS": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-strings",
    "STRIP": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-strip",
    "_CE_CONDA": "",
    "_CE_M": "",
    "_CONDA_PYTHON_SYSCONFIGDATA_NAME": "_sysconfigdata_x86_64_conda_cos6_linux_gnu"
   },
   "language": "python",
   "metadata": {
    "cocalc": {
     "description": "Python/R distribution for data science",
     "priority": 5,
     "url": "https://www.anaconda.com/distribution/"
    }
   },
   "name": "anaconda2020",
   "resource_dir": "/ext/jupyter/kernels/anaconda2020"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}