Contact
CoCalc Logo Icon
Store Features Docs Share Support News About Sign Up Sign In
| Download
Views: 17875
Image: ubuntu2004
Kernel: Python 3 (Anaconda 2020)
!pip install pandas_read_xml
Defaulting to user installation because normal site-packages is not writeable Requirement already satisfied: pandas_read_xml in /home/user/.local/lib/python3.7/site-packages (0.3.1) Requirement already satisfied: zipfile36 in /home/user/.local/lib/python3.7/site-packages (from pandas_read_xml) (0.1.3) Requirement already satisfied: distlib in /ext/anaconda2020.02/lib/python3.7/site-packages (from pandas_read_xml) (0.3.1) Requirement already satisfied: pandas in /ext/anaconda2020.02/lib/python3.7/site-packages (from pandas_read_xml) (1.1.5) Requirement already satisfied: urllib3>=1.26.3 in /home/user/.local/lib/python3.7/site-packages (from pandas_read_xml) (1.26.4) Requirement already satisfied: requests in /ext/anaconda2020.02/lib/python3.7/site-packages (from pandas_read_xml) (2.24.0) Requirement already satisfied: xmltodict in /ext/anaconda2020.02/lib/python3.7/site-packages (from pandas_read_xml) (0.12.0) Requirement already satisfied: pyarrow in /home/user/.local/lib/python3.7/site-packages (from pandas_read_xml) (3.0.0) Requirement already satisfied: python-dateutil>=2.7.3 in /ext/anaconda2020.02/lib/python3.7/site-packages (from pandas->pandas_read_xml) (2.8.0) Requirement already satisfied: pytz>=2017.2 in /ext/anaconda2020.02/lib/python3.7/site-packages (from pandas->pandas_read_xml) (2019.3) Requirement already satisfied: numpy>=1.15.4 in /ext/anaconda2020.02/lib/python3.7/site-packages (from pandas->pandas_read_xml) (1.18.5) Requirement already satisfied: idna<3,>=2.5 in /ext/anaconda2020.02/lib/python3.7/site-packages (from requests->pandas_read_xml) (2.8) Requirement already satisfied: certifi>=2017.4.17 in /ext/anaconda2020.02/lib/python3.7/site-packages (from requests->pandas_read_xml) (2020.12.5) Requirement already satisfied: chardet<4,>=3.0.2 in /ext/anaconda2020.02/lib/python3.7/site-packages (from requests->pandas_read_xml) (3.0.4) Requirement already satisfied: six>=1.5 in /ext/anaconda2020.02/lib/python3.7/site-packages (from 
python-dateutil>=2.7.3->pandas->pandas_read_xml) (1.14.0)
test_xml = """<?xml version="1.0" encoding="UTF-8"?> <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.0" xml:lang="en"> <front> <journal-meta> <journal-id journal-id-type="publisher-id">jreligion</journal-id> <journal-title-group> <journal-title>The Journal of Religion</journal-title> </journal-title-group> <publisher> <publisher-name>The University of Chicago Press</publisher-name> </publisher> <issn pub-type="ppub">00224189</issn> <issn pub-type="epub">15496538</issn> <custom-meta-group/> </journal-meta> <article-meta> <article-id pub-id-type="jstor-stable">4625926</article-id> <article-id pub-id-type="doi">10.1086/522275</article-id> <article-id pub-id-type="msid">JR0700335</article-id> <title-group> <article-title>A Critique of Gordon Kaufman’s Theological Method, with Special Reference to His Theory of Religion</article-title> </title-group> <contrib-group> <contrib contrib-type="author" xlink:type="simple"> <string-name> <given-names>Joshua</given-names> <x xml:space="preserve"> </x> <surname>Braley</surname> </string-name> </contrib> <aff id="aff_1">Santa Fe Community College</aff> </contrib-group> <pub-date pub-type="ppub"> <month>01</month> <year>2008</year> <string-date>January 2008</string-date> </pub-date> <volume>88</volume> <issue>1</issue> <issue-id>522213</issue-id> <fpage>29</fpage> <lpage>52</lpage> <permissions> <copyright-statement>© 2008 by The University of Chicago. 
All rights reserved.</copyright-statement> <copyright-year>2008</copyright-year> <copyright-holder>The University of Chicago</copyright-holder> </permissions> <self-uri xlink:href="https://www.jstor.org/stable/10.1086/522275"/> <custom-meta-group> <custom-meta> <meta-name>lang</meta-name> <meta-value>en</meta-value> </custom-meta> </custom-meta-group> </article-meta> <notes notes-type="epigraph"> <disp-quote> <p>But then, you’ll say, God and religion are the same! (Jonathan Edwards)</p> </disp-quote> </notes> </front> <back> </back> </article> """
# pdxi is the alias used for read_xml below; the helpers are imported by name
# so they can be passed straight to DataFrame.pipe.
import pandas_read_xml as pdxi
from pandas_read_xml import flatten, fully_flatten, auto_separate_tables
/ext/anaconda2020.02/lib/python3.7/site-packages/requests/__init__.py:91: RequestsDependencyWarning: urllib3 (1.26.4) or chardet (3.0.4) doesn't match a supported version! RequestsDependencyWarning)
# Parse the XML held in test_xml. ['article'] is the key path to the element
# used as the table root: its attributes (@...) and its child elements
# (front, back) become the columns of the single resulting row.
df = pdxi.read_xml(test_xml, ['article'])
df
@xmlns:xlink @xmlns:mml @xmlns:xsi @article-type @dtd-version @xml:lang front back
0 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en {'journal-meta': {'journal-id': {'@journal-id-... None
# Flatten pass 1: the nested dict in `front` is expanded one level into
# `front|journal-meta`, `front|article-meta` and `front|notes` columns.
df = df.pipe(flatten)
df
@xmlns:xlink @xmlns:mml @xmlns:xsi @article-type @dtd-version @xml:lang back front|journal-meta front|article-meta front|notes
0 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None {'journal-id': {'@journal-id-type': 'publisher... {'article-id': [{'@pub-id-type': 'jstor-stable... {'@notes-type': 'epigraph', 'disp-quote': {'p'...
# Flatten pass 2: the children of journal-meta, article-meta and notes each
# get their own `parent|child` column (still 1 row, now 26 columns).
df = df.pipe(flatten)
df
@xmlns:xlink @xmlns:mml @xmlns:xsi @article-type @dtd-version @xml:lang back front|journal-meta|journal-id front|journal-meta|journal-title-group front|journal-meta|publisher ... front|article-meta|volume front|article-meta|issue front|article-meta|issue-id front|article-meta|fpage front|article-meta|lpage front|article-meta|permissions front|article-meta|self-uri front|article-meta|custom-meta-group front|notes|@notes-type front|notes|disp-quote
0 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None {'@journal-id-type': 'publisher-id', '#text': ... {'journal-title': 'The Journal of Religion'} {'publisher-name': 'The University of Chicago ... ... 88 1 522213 29 52 {'copyright-statement': '© 2008 by The Univers... {'@xlink:href': 'https://www.jstor.org/stable/... {'custom-meta': {'meta-name': 'lang', 'meta-va... epigraph {'p': 'But then, you’ll say, God and religion ...

1 rows × 26 columns

# Flatten pass 3: besides expanding another level of dicts (pub-date,
# permissions, ...), repeated elements are turned into rows. The two <issn>
# and three <article-id> entries are combined, so the frame grows from
# 1 row to 6 rows (every issn / article-id pairing).
df = df.pipe(flatten)
df
@xmlns:xlink @xmlns:mml @xmlns:xsi @article-type @dtd-version @xml:lang back front|journal-meta|issn front|journal-meta|custom-meta-group front|article-meta|article-id ... front|article-meta|pub-date|@pub-type front|article-meta|pub-date|month front|article-meta|pub-date|year front|article-meta|pub-date|string-date front|article-meta|permissions|copyright-statement front|article-meta|permissions|copyright-year front|article-meta|permissions|copyright-holder front|article-meta|self-uri|@xlink:href front|article-meta|custom-meta-group|custom-meta front|notes|disp-quote|p
0 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None {'@pub-type': 'ppub', '#text': '00224189'} None {'@pub-id-type': 'jstor-stable', '#text': '462... ... ppub 01 2008 January 2008 © 2008 by The University of Chicago. All right... 2008 The University of Chicago https://www.jstor.org/stable/10.1086/522275 {'meta-name': 'lang', 'meta-value': 'en'} But then, you’ll say, God and religion are the...
1 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None {'@pub-type': 'ppub', '#text': '00224189'} None {'@pub-id-type': 'doi', '#text': '10.1086/5222... ... ppub 01 2008 January 2008 © 2008 by The University of Chicago. All right... 2008 The University of Chicago https://www.jstor.org/stable/10.1086/522275 {'meta-name': 'lang', 'meta-value': 'en'} But then, you’ll say, God and religion are the...
2 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None {'@pub-type': 'ppub', '#text': '00224189'} None {'@pub-id-type': 'msid', '#text': 'JR0700335'} ... ppub 01 2008 January 2008 © 2008 by The University of Chicago. All right... 2008 The University of Chicago https://www.jstor.org/stable/10.1086/522275 {'meta-name': 'lang', 'meta-value': 'en'} But then, you’ll say, God and religion are the...
3 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None {'@pub-type': 'epub', '#text': '15496538'} None {'@pub-id-type': 'jstor-stable', '#text': '462... ... ppub 01 2008 January 2008 © 2008 by The University of Chicago. All right... 2008 The University of Chicago https://www.jstor.org/stable/10.1086/522275 {'meta-name': 'lang', 'meta-value': 'en'} But then, you’ll say, God and religion are the...
4 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None {'@pub-type': 'epub', '#text': '15496538'} None {'@pub-id-type': 'doi', '#text': '10.1086/5222... ... ppub 01 2008 January 2008 © 2008 by The University of Chicago. All right... 2008 The University of Chicago https://www.jstor.org/stable/10.1086/522275 {'meta-name': 'lang', 'meta-value': 'en'} But then, you’ll say, God and religion are the...
5 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None {'@pub-type': 'epub', '#text': '15496538'} None {'@pub-id-type': 'msid', '#text': 'JR0700335'} ... ppub 01 2008 January 2008 © 2008 by The University of Chicago. All right... 2008 The University of Chicago https://www.jstor.org/stable/10.1086/522275 {'meta-name': 'lang', 'meta-value': 'en'} But then, you’ll say, God and religion are the...

6 rows × 33 columns

# Flatten pass 4: the per-row issn / article-id dicts are split into an
# attribute column (e.g. `...|article-id|@pub-id-type`) and a text column,
# and the contrib-group and custom-meta children are expanded (39 columns).
df = df.pipe(flatten)
df
@xmlns:xlink @xmlns:mml @xmlns:xsi @article-type @dtd-version @xml:lang back front|journal-meta|custom-meta-group front|article-meta|volume front|article-meta|issue ... front|journal-meta|issn front|article-meta|article-id|@pub-id-type front|article-meta|article-id front|article-meta|contrib-group|contrib|@contrib-type front|article-meta|contrib-group|contrib|@xlink:type front|article-meta|contrib-group|contrib|string-name front|article-meta|contrib-group|aff|@id front|article-meta|contrib-group|aff front|article-meta|custom-meta-group|custom-meta|meta-name front|article-meta|custom-meta-group|custom-meta|meta-value
0 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None None 88 1 ... 00224189 jstor-stable 4625926 author simple {'given-names': 'Joshua', 'x': {'@xml:space': ... aff_1 Santa Fe Community College lang en
1 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None None 88 1 ... 00224189 doi 10.1086/522275 author simple {'given-names': 'Joshua', 'x': {'@xml:space': ... aff_1 Santa Fe Community College lang en
2 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None None 88 1 ... 00224189 msid JR0700335 author simple {'given-names': 'Joshua', 'x': {'@xml:space': ... aff_1 Santa Fe Community College lang en
3 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None None 88 1 ... 15496538 jstor-stable 4625926 author simple {'given-names': 'Joshua', 'x': {'@xml:space': ... aff_1 Santa Fe Community College lang en
4 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None None 88 1 ... 15496538 doi 10.1086/522275 author simple {'given-names': 'Joshua', 'x': {'@xml:space': ... aff_1 Santa Fe Community College lang en
5 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None None 88 1 ... 15496538 msid JR0700335 author simple {'given-names': 'Joshua', 'x': {'@xml:space': ... aff_1 Santa Fe Community College lang en

6 rows × 39 columns

# Flatten pass 5: `...|contrib|string-name` is expanded into given-names,
# x and surname columns (41 columns); only `x` still holds a dict.
df = df.pipe(flatten)
df
@xmlns:xlink @xmlns:mml @xmlns:xsi @article-type @dtd-version @xml:lang back front|journal-meta|custom-meta-group front|article-meta|volume front|article-meta|issue ... front|article-meta|article-id front|article-meta|contrib-group|contrib|@contrib-type front|article-meta|contrib-group|contrib|@xlink:type front|article-meta|contrib-group|aff|@id front|article-meta|contrib-group|aff front|article-meta|custom-meta-group|custom-meta|meta-name front|article-meta|custom-meta-group|custom-meta|meta-value front|article-meta|contrib-group|contrib|string-name|given-names front|article-meta|contrib-group|contrib|string-name|x front|article-meta|contrib-group|contrib|string-name|surname
0 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None None 88 1 ... 4625926 author simple aff_1 Santa Fe Community College lang en Joshua {'@xml:space': 'preserve'} Braley
1 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None None 88 1 ... 10.1086/522275 author simple aff_1 Santa Fe Community College lang en Joshua {'@xml:space': 'preserve'} Braley
2 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None None 88 1 ... JR0700335 author simple aff_1 Santa Fe Community College lang en Joshua {'@xml:space': 'preserve'} Braley
3 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None None 88 1 ... 4625926 author simple aff_1 Santa Fe Community College lang en Joshua {'@xml:space': 'preserve'} Braley
4 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None None 88 1 ... 10.1086/522275 author simple aff_1 Santa Fe Community College lang en Joshua {'@xml:space': 'preserve'} Braley
5 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None None 88 1 ... JR0700335 author simple aff_1 Santa Fe Community College lang en Joshua {'@xml:space': 'preserve'} Braley

6 rows × 41 columns

# Flatten pass 6: the last remaining dict, `...|string-name|x`, becomes the
# scalar column `...|string-name|x|@xml:space` (column count stays at 41).
df = df.pipe(flatten)
df
@xmlns:xlink @xmlns:mml @xmlns:xsi @article-type @dtd-version @xml:lang back front|journal-meta|custom-meta-group front|article-meta|volume front|article-meta|issue ... front|article-meta|article-id front|article-meta|contrib-group|contrib|@contrib-type front|article-meta|contrib-group|contrib|@xlink:type front|article-meta|contrib-group|aff|@id front|article-meta|contrib-group|aff front|article-meta|custom-meta-group|custom-meta|meta-name front|article-meta|custom-meta-group|custom-meta|meta-value front|article-meta|contrib-group|contrib|string-name|given-names front|article-meta|contrib-group|contrib|string-name|surname front|article-meta|contrib-group|contrib|string-name|x|@xml:space
0 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None None 88 1 ... 4625926 author simple aff_1 Santa Fe Community College lang en Joshua Braley preserve
1 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None None 88 1 ... 10.1086/522275 author simple aff_1 Santa Fe Community College lang en Joshua Braley preserve
2 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None None 88 1 ... JR0700335 author simple aff_1 Santa Fe Community College lang en Joshua Braley preserve
3 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None None 88 1 ... 4625926 author simple aff_1 Santa Fe Community College lang en Joshua Braley preserve
4 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None None 88 1 ... 10.1086/522275 author simple aff_1 Santa Fe Community College lang en Joshua Braley preserve
5 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None None 88 1 ... JR0700335 author simple aff_1 Santa Fe Community College lang en Joshua Braley preserve

6 rows × 41 columns

# Flatten pass 7: the output below is unchanged from the previous pass
# (still 6 rows x 41 columns), i.e. the frame is already fully flat and a
# further flatten leaves it as it is.
df = df.pipe(flatten)
df
@xmlns:xlink @xmlns:mml @xmlns:xsi @article-type @dtd-version @xml:lang back front|journal-meta|custom-meta-group front|article-meta|volume front|article-meta|issue ... front|article-meta|article-id front|article-meta|contrib-group|contrib|@contrib-type front|article-meta|contrib-group|contrib|@xlink:type front|article-meta|contrib-group|aff|@id front|article-meta|contrib-group|aff front|article-meta|custom-meta-group|custom-meta|meta-name front|article-meta|custom-meta-group|custom-meta|meta-value front|article-meta|contrib-group|contrib|string-name|given-names front|article-meta|contrib-group|contrib|string-name|surname front|article-meta|contrib-group|contrib|string-name|x|@xml:space
0 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None None 88 1 ... 4625926 author simple aff_1 Santa Fe Community College lang en Joshua Braley preserve
1 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None None 88 1 ... 10.1086/522275 author simple aff_1 Santa Fe Community College lang en Joshua Braley preserve
2 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None None 88 1 ... JR0700335 author simple aff_1 Santa Fe Community College lang en Joshua Braley preserve
3 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None None 88 1 ... 4625926 author simple aff_1 Santa Fe Community College lang en Joshua Braley preserve
4 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None None 88 1 ... 10.1086/522275 author simple aff_1 Santa Fe Community College lang en Joshua Braley preserve
5 http://www.w3.org/1999/xlink http://www.w3.org/1998/Math/MathML http://www.w3.org/2001/XMLSchema-instance research-article 1.0 en None None 88 1 ... JR0700335 author simple aff_1 Santa Fe Community College lang en Joshua Braley preserve

6 rows × 41 columns

# Column(s) repeated in every separated table so the tables can be related
# back to one another.
key_columns = ['front|article-meta|pub-date|string-date']
# Direct call; equivalent to df.pipe(auto_separate_tables, key_columns).
data = auto_separate_tables(df, key_columns)
# The result is keyed by table name; list the tables that were produced.
data.keys()
dict_keys(['journal-meta|issn', 'article-meta|article-id', 'article-meta|contrib-group|contrib', 'article-meta|contrib-group|aff', 'article-meta|custom-meta-group|custom-meta', 'front'])
# Display the separated table for the affiliation (<aff>) element; it carries
# the key column plus the element's @id attribute and text.
data['article-meta|contrib-group|aff']
article-meta|pub-date|string-date @id article-meta|contrib-group|aff
0 January 2008 aff_1 Santa Fe Community College
# Display the separated table for <article-id>: one row per identifier
# (jstor-stable, doi, msid) alongside the key column.
data['article-meta|article-id']
article-meta|pub-date|string-date @pub-id-type article-meta|article-id
0 January 2008 jstor-stable 4625926
1 January 2008 doi 10.1086/522275
2 January 2008 msid JR0700335