CoCalc -- 2021-04-15-092547.ipynb

Project: Jeremee Nute - REL 370: Digital Texts in the Humanities

Path: Final Project / 2021-04-15-092547.ipynb

Views: ¹⁷⁸⁷⁵
Image: ubuntu2004

Kernel: Python 3 (Anaconda 2020)

In [1]:

from bs4 import BeautifulSoup

In [2]:

import lxml

In [7]:

with open('journal-article-10.1086_427313.xml', 'r') as f:
    ugly_xml = f.read()

In [8]:

soup = BeautifulSoup(ugly_xml, 'xml')

In [9]:

soup

<?xml version="1.0" encoding="utf-8"?>
<article article-type="research-article" dtd-version="1.0" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">jreligion</journal-id>
<journal-title-group>
<journal-title>The Journal of Religion</journal-title>
</journal-title-group>
<publisher>
<publisher-name>The University of Chicago Press</publisher-name>
</publisher>
<issn pub-type="ppub">00224189</issn>
<issn pub-type="epub">15496538</issn>
<custom-meta-group/>
</journal-meta>
<article-meta>
<article-id pub-id-type="jstor-stable">3591443</article-id>
<article-id pub-id-type="doi">10.1086/427313</article-id>
<article-id pub-id-type="msid">JR850201</article-id>
<title-group>
<article-title>A Correlational Model of Comparative Theology</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" xlink:type="simple">
<string-name>
<given-names>Hugh</given-names>
<x xml:space="preserve"> </x>
<surname>Nicholson</surname>
</string-name>
<x xml:space="preserve"> </x>
</contrib>
<aff id="aff_1">Coe College</aff>
</contrib-group>
<pub-date pub-type="ppub">
<month>04</month>
<year>2005</year>
<string-date>April 2005</string-date>
</pub-date>
<volume>85</volume>
<issue>2</issue>
<issue-id>jr.2005.85.issue-2</issue-id>
<fpage>191</fpage>
<lpage>213</lpage>
<permissions>
<copyright-statement>© 2005 by The University of Chicago. All rights reserved.</copyright-statement>
<copyright-year>2005</copyright-year>
<copyright-holder>The University of Chicago</copyright-holder>
</permissions>
<self-uri xlink:href="https://www.jstor.org/stable/10.1086/427313"/>
<custom-meta-group>
<custom-meta>
<meta-name>lang</meta-name>
<meta-value>en</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<back>
</back>
</article>

In [10]:

soup.find('article-title').get_text()

'A Correlational Model of Comparative Theology'

In [19]:

data_fields = ['given-names', 'surname', 'article-title', 'string-date']

In [20]:

data_from_article = {}

for field in data_fields:
    print(field)
    content = soup.find(field).get_text()
    data_from_article[field] = content

given-names
surname
article-title
string-date

In [21]:

data_from_article

{'given-names': 'Hugh',
 'surname': 'Nicholson',
 'article-title': 'A Correlational Model of Comparative Theology',
 'string-date': 'April 2005'}

In [0]:

get all the file names form a directory

for each file name:
    open the file, load content
    pass the content into beautifulsoup
    parse the content using soup
    save results to dictionary
    
save filename and results to large dictionary

In [4]:

def extract_xml_data(xml_content, data_field_list):
   
    #soup = BeautifulSoup(xml_content, 'xml')

    data_from_article = {}
    
    for field in data_field_list:
        content = soup.find(field).get_text()
        data_from_article[field] = content
        
    return data_from_article

In [5]:

with open('journal-article-10.1086_427313.xml','r') as f:
    ugly_xml = f.read()

In [7]:

from bs4 import BeautifulSoup

In [0]:

In [16]:

fields = ['given-names', 'surname', 'article-title', 'month','year']
data_dict = extract_xml_data(ugly_xml, fields)

In [0]:

In [10]:

data_dict

{'given-names': 'Hugh',
 'surname': 'Nicholson',
 'article-title': 'A Correlational Model of Comparative Theology',
 'string-date': 'April 2005'}

In [11]:

import os

In [12]:

xml_files = os.listdir('Metadata')

In [13]:

xml_files

['journal-article-10.1086_421828.xml',
 'journal-article-10.1086_381213.xml',
 'journal-article-10.1086_424411.xml',
 'journal-article-10.1086_427313.xml',
 'journal-article-10.1086_382331.xml']

In [27]:

all_data = {}
fields = ['given-names', 'surname', 'article-title', 'month','year', 'subject']

for each in xml_files:
    with open('Metadata/{}'.format(each), 'r') as f:
        ugly_xml = f.read()
    
    soup = BeautifulSoup(ugly_xml, 'xml')
    
    subject_value = soup.find('subject').get_text()
    if subject_value == "Book Review":
        pass
    else:
        all_data[each] = extract_xml_data(ugly_xml, fields)

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-27-2db4f8998aa5> in <module>
      8     soup = BeautifulSoup(ugly_xml, 'xml')
      9 
---> 10     subject_value = soup.find('subject').get_text()
     11     if subject_value == "Book Review":
     12         pass
AttributeError: 'NoneType' object has no attribute 'get_text'

In [0]:

In [0]:

In [0]:

In [0]:

In [0]:

In [0]:

In [0]:

#def check_for_review():
    #Check the "subject" field,
    #If == "Book Review", then pass

In [22]:

all_data

{'journal-article-10.1086_421828.xml': {'given-names': 'David\xa0W.',
  'surname': 'Chappell',
  'article-title': '',
  'month': '04',
  'year': '2004'},
 'journal-article-10.1086_381213.xml': {'given-names': 'W.\xa0Clark',
  'surname': 'Gilpin',
  'article-title': 'Enlightened Genealogies of Religion: Edward Gibbon and His Contemporaries*',
  'month': '04',
  'year': '2004'},
 'journal-article-10.1086_424411.xml': {'given-names': 'Nicholas',
  'surname': 'Koss',
  'article-title': '',
  'month': '07',
  'year': '2004'},
 'journal-article-10.1086_427313.xml': {'given-names': 'Hugh',
  'surname': 'Nicholson',
  'article-title': 'A Correlational Model of Comparative Theology',
  'month': '04',
  'year': '2005'},
 'journal-article-10.1086_382331.xml': {'given-names': 'Robert\xa0Ford',
  'surname': 'Campany',
  'article-title': '',
  'month': '01',
  'year': '2004'}}

In [0]:

In [0]:

In [0]:

In [0]:

In [0]:

In [0]:

In [0]:

In [0]:

In [0]:

In [0]:

In [0]: