Contact
CoCalc Logo Icon
StoreFeaturesDocsShareSupport News AboutSign UpSign In
| Download
Views: 17875
Image: ubuntu2004
Kernel: Python 3 (Anaconda 2020)
from bs4 import BeautifulSoup
import lxml
# Read the raw XML of a single sample article into memory.
with open('journal-article-10.1086_427313.xml', 'r') as xml_file:
    ugly_xml = xml_file.read()

# Parse the text with BeautifulSoup's 'xml' parser feature and display
# the resulting tree as the cell output.
soup = BeautifulSoup(ugly_xml, 'xml')
soup
<?xml version="1.0" encoding="utf-8"?> <article article-type="research-article" dtd-version="1.0" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> <front> <journal-meta> <journal-id journal-id-type="publisher-id">jreligion</journal-id> <journal-title-group> <journal-title>The Journal of Religion</journal-title> </journal-title-group> <publisher> <publisher-name>The University of Chicago Press</publisher-name> </publisher> <issn pub-type="ppub">00224189</issn> <issn pub-type="epub">15496538</issn> <custom-meta-group/> </journal-meta> <article-meta> <article-id pub-id-type="jstor-stable">3591443</article-id> <article-id pub-id-type="doi">10.1086/427313</article-id> <article-id pub-id-type="msid">JR850201</article-id> <title-group> <article-title>A Correlational Model of Comparative Theology</article-title> </title-group> <contrib-group> <contrib contrib-type="author" xlink:type="simple"> <string-name> <given-names>Hugh</given-names> <x xml:space="preserve"> </x> <surname>Nicholson</surname> </string-name> <x xml:space="preserve"> </x> </contrib> <aff id="aff_1">Coe College</aff> </contrib-group> <pub-date pub-type="ppub"> <month>04</month> <year>2005</year> <string-date>April 2005</string-date> </pub-date> <volume>85</volume> <issue>2</issue> <issue-id>jr.2005.85.issue-2</issue-id> <fpage>191</fpage> <lpage>213</lpage> <permissions> <copyright-statement>© 2005 by The University of Chicago. All rights reserved.</copyright-statement> <copyright-year>2005</copyright-year> <copyright-holder>The University of Chicago</copyright-holder> </permissions> <self-uri xlink:href="https://www.jstor.org/stable/10.1086/427313"/> <custom-meta-group> <custom-meta> <meta-name>lang</meta-name> <meta-value>en</meta-value> </custom-meta> </custom-meta-group> </article-meta> </front> <back> </back> </article>
# Look up the first <article-title> element in the parsed document and
# show its text content.
soup.find('article-title').get_text()
'A Correlational Model of Comparative Theology'
# Element names whose text we want to pull out of the parsed article.
data_fields = [
    'given-names',
    'surname',
    'article-title',
    'string-date',
]

# Map each element name to the text of its first occurrence in `soup`,
# echoing the name as we go so progress is visible in the cell output.
data_from_article = {}
for field in data_fields:
    print(field)
    data_from_article[field] = soup.find(field).get_text()
given-names surname article-title string-date
# Display the field -> text dictionary collected for the sample article.
data_from_article
{'given-names': 'Hugh', 'surname': 'Nicholson', 'article-title': 'A Correlational Model of Comparative Theology', 'string-date': 'April 2005'}
Plan: get all the file names from a directory. For each file name: open the file and load its content; pass the content into BeautifulSoup; parse the content using soup; save the results to a dictionary; save the filename and results to a large dictionary.
def extract_xml_data(xml_content, data_field_list):
    """Extract the text of selected elements from an XML document.

    Args:
        xml_content: The XML document as a string.
        data_field_list: Iterable of element names to look up.

    Returns:
        A dict mapping each requested element name to the text of its
        first occurrence in the document, or to None when the document
        has no such element.
    """
    # Parse the content that was actually passed in, rather than relying on
    # whatever `soup` happens to exist at module level.
    soup = BeautifulSoup(xml_content, 'xml')

    data_from_article = {}
    for field in data_field_list:
        element = soup.find(field)
        # find() returns None for a missing element; record that as None
        # instead of failing on `.get_text()`.
        data_from_article[field] = (
            element.get_text() if element is not None else None
        )
    return data_from_article
# Re-read the sample article so `ugly_xml` holds its raw XML text.
with open('journal-article-10.1086_427313.xml', 'r') as xml_file:
    ugly_xml = xml_file.read()
from bs4 import BeautifulSoup
# Element names to extract from the sample article.
fields = [
    'given-names',
    'surname',
    'article-title',
    'month',
    'year',
]

# Run the extraction helper on the sample article and display the result.
data_dict = extract_xml_data(ugly_xml, fields)
data_dict
{'given-names': 'Hugh', 'surname': 'Nicholson', 'article-title': 'A Correlational Model of Comparative Theology', 'string-date': 'April 2005'}
import os
# os.listdir() returns every directory entry in arbitrary order, so keep
# only the .xml files and sort them; this keeps stray entries out of the
# processing loop and makes the run order reproducible.
xml_files = sorted(
    name for name in os.listdir('Metadata') if name.endswith('.xml')
)
xml_files
['journal-article-10.1086_421828.xml', 'journal-article-10.1086_381213.xml', 'journal-article-10.1086_424411.xml', 'journal-article-10.1086_427313.xml', 'journal-article-10.1086_382331.xml']
# Build {filename: {field: text}} for every article in Metadata/ that is
# not a book review.
all_data = {}
fields = ['given-names', 'surname', 'article-title', 'month', 'year', 'subject']

for each in xml_files:
    with open('Metadata/{}'.format(each), 'r') as f:
        ugly_xml = f.read()
    soup = BeautifulSoup(ugly_xml, 'xml')

    # Not every article carries a <subject> element; find() returns None
    # in that case, so guard before calling get_text().
    subject_tag = soup.find('subject')
    subject_value = subject_tag.get_text() if subject_tag is not None else None
    if subject_value == "Book Review":
        continue

    # Ask only for the elements this document actually contains, then fill
    # the absent ones with None so every record has the same keys.
    present_fields = [name for name in fields if soup.find(name) is not None]
    record = extract_xml_data(ugly_xml, present_fields)
    for name in fields:
        record.setdefault(name, None)
    all_data[each] = record
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-27-2db4f8998aa5> in <module> 8 soup = BeautifulSoup(ugly_xml, 'xml') 9 ---> 10 subject_value = soup.find('subject').get_text() 11 if subject_value == "Book Review": 12 pass AttributeError: 'NoneType' object has no attribute 'get_text'
#def check_for_review(): #Check the "subject" field, #If == "Book Review", then pass
# Display the per-file records gathered by the batch loop.
all_data
{'journal-article-10.1086_421828.xml': {'given-names': 'David\xa0W.', 'surname': 'Chappell', 'article-title': '', 'month': '04', 'year': '2004'}, 'journal-article-10.1086_381213.xml': {'given-names': 'W.\xa0Clark', 'surname': 'Gilpin', 'article-title': 'Enlightened Genealogies of Religion: Edward Gibbon and His Contemporaries*', 'month': '04', 'year': '2004'}, 'journal-article-10.1086_424411.xml': {'given-names': 'Nicholas', 'surname': 'Koss', 'article-title': '', 'month': '07', 'year': '2004'}, 'journal-article-10.1086_427313.xml': {'given-names': 'Hugh', 'surname': 'Nicholson', 'article-title': 'A Correlational Model of Comparative Theology', 'month': '04', 'year': '2005'}, 'journal-article-10.1086_382331.xml': {'given-names': 'Robert\xa0Ford', 'surname': 'Campany', 'article-title': '', 'month': '01', 'year': '2004'}}