{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Defaulting to user installation because normal site-packages is not writeable\r\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pandas_read_xml in /home/user/.local/lib/python3.7/site-packages (0.3.1)\r\n",
"Requirement already satisfied: pyarrow in /home/user/.local/lib/python3.7/site-packages (from pandas_read_xml) (3.0.0)\r\n",
"Requirement already satisfied: requests in /ext/anaconda2020.02/lib/python3.7/site-packages (from pandas_read_xml) (2.24.0)\r\n",
"Requirement already satisfied: zipfile36 in /home/user/.local/lib/python3.7/site-packages (from pandas_read_xml) (0.1.3)\r\n",
"Requirement already satisfied: distlib in /ext/anaconda2020.02/lib/python3.7/site-packages (from pandas_read_xml) (0.3.1)\r\n",
"Requirement already satisfied: xmltodict in /ext/anaconda2020.02/lib/python3.7/site-packages (from pandas_read_xml) (0.12.0)\r\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pandas in /ext/anaconda2020.02/lib/python3.7/site-packages (from pandas_read_xml) (1.1.5)\r\n",
"Requirement already satisfied: urllib3>=1.26.3 in /home/user/.local/lib/python3.7/site-packages (from pandas_read_xml) (1.26.4)\r\n",
"Requirement already satisfied: numpy>=1.16.6 in /ext/anaconda2020.02/lib/python3.7/site-packages (from pyarrow->pandas_read_xml) (1.18.5)\r\n",
"Requirement already satisfied: chardet<4,>=3.0.2 in /ext/anaconda2020.02/lib/python3.7/site-packages (from requests->pandas_read_xml) (3.0.4)\r\n",
"Requirement already satisfied: idna<3,>=2.5 in /ext/anaconda2020.02/lib/python3.7/site-packages (from requests->pandas_read_xml) (2.8)\r\n",
"Requirement already satisfied: certifi>=2017.4.17 in /ext/anaconda2020.02/lib/python3.7/site-packages (from requests->pandas_read_xml) (2020.12.5)\r\n",
"Requirement already satisfied: python-dateutil>=2.7.3 in /ext/anaconda2020.02/lib/python3.7/site-packages (from pandas->pandas_read_xml) (2.8.0)\r\n",
"Requirement already satisfied: pytz>=2017.2 in /ext/anaconda2020.02/lib/python3.7/site-packages (from pandas->pandas_read_xml) (2019.3)\r\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: six>=1.5 in /ext/anaconda2020.02/lib/python3.7/site-packages (from python-dateutil>=2.7.3->pandas->pandas_read_xml) (1.14.0)\r\n"
]
}
],
"source": [
"!pip install pandas_read_xml"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"test_xml = \"\"\"\n",
"\n",
"\n",
" \n",
" Java For Dummies\n",
" Tan Ah Teck\n",
" Programming\n",
" 2009\n",
" 7\n",
" 19.99\n",
" \n",
" \n",
" More Java For Dummies\n",
" Tan Ah Teck\n",
" Programming\n",
" 2008\n",
" 25.99\n",
" \n",
" \n",
" The Complete Guide to Fishing\n",
" Bill Jones\n",
" James Cook\n",
" Mary Turing\n",
" Fishing\n",
" Leisure\n",
" French\n",
" 2000\n",
" 2\n",
" 49.99\n",
" \n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/ext/anaconda2020.02/lib/python3.7/site-packages/requests/__init__.py:91: RequestsDependencyWarning: urllib3 (1.26.4) or chardet (3.0.4) doesn't match a supported version!\n",
" RequestsDependencyWarning)\n"
]
}
],
"source": [
"import pandas_read_xml as pdxi\n",
"from pandas_read_xml import flatten, fully_flatten, auto_separate_tables"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" book | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" [{'@ISBN': '0123456001', 'title': 'Java For Du... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" book\n",
"0 [{'@ISBN': '0123456001', 'title': 'Java For Du..."
]
},
"execution_count": 4,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"df = pdxi.read_xml(test_xml, ['bookstore'])\n",
" \n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" book | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" {'@ISBN': '0123456001', 'title': 'Java For Dum... | \n",
"
\n",
" \n",
" 1 | \n",
" {'@ISBN': '0123456002', 'title': 'More Java Fo... | \n",
"
\n",
" \n",
" 2 | \n",
" {'@ISBN': '0123456010', 'title': 'The Complete... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" book\n",
"0 {'@ISBN': '0123456001', 'title': 'Java For Dum...\n",
"1 {'@ISBN': '0123456002', 'title': 'More Java Fo...\n",
"2 {'@ISBN': '0123456010', 'title': 'The Complete..."
]
},
"execution_count": 5,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"df = df.pipe(flatten)\n",
" \n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" book|@ISBN | \n",
" book|title | \n",
" book|author | \n",
" book|category | \n",
" book|year | \n",
" book|edition | \n",
" book|price | \n",
" book|language | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0123456001 | \n",
" Java For Dummies | \n",
" Tan Ah Teck | \n",
" Programming | \n",
" 2009 | \n",
" 7 | \n",
" 19.99 | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" 0123456002 | \n",
" More Java For Dummies | \n",
" Tan Ah Teck | \n",
" Programming | \n",
" 2008 | \n",
" NaN | \n",
" 25.99 | \n",
" NaN | \n",
"
\n",
" \n",
" 2 | \n",
" 0123456010 | \n",
" The Complete Guide to Fishing | \n",
" [Bill Jones, James Cook, Mary Turing] | \n",
" [Fishing, Leisure] | \n",
" 2000 | \n",
" 2 | \n",
" 49.99 | \n",
" French | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" book|@ISBN book|title \\\n",
"0 0123456001 Java For Dummies \n",
"1 0123456002 More Java For Dummies \n",
"2 0123456010 The Complete Guide to Fishing \n",
"\n",
" book|author book|category book|year \\\n",
"0 Tan Ah Teck Programming 2009 \n",
"1 Tan Ah Teck Programming 2008 \n",
"2 [Bill Jones, James Cook, Mary Turing] [Fishing, Leisure] 2000 \n",
"\n",
" book|edition book|price book|language \n",
"0 7 19.99 NaN \n",
"1 NaN 25.99 NaN \n",
"2 2 49.99 French "
]
},
"execution_count": 6,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"df = df.pipe(flatten)\n",
" \n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
"key_columns = ['book|@ISBN']\n",
" \n",
"data = df.pipe(auto_separate_tables, key_columns)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['author', 'category', 'book'])"
]
},
"execution_count": 9,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"data.keys()\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" @ISBN | \n",
" author | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0123456010 | \n",
" Bill Jones | \n",
"
\n",
" \n",
" 1 | \n",
" 0123456010 | \n",
" James Cook | \n",
"
\n",
" \n",
" 2 | \n",
" 0123456010 | \n",
" Mary Turing | \n",
"
\n",
" \n",
" 3 | \n",
" 0123456001 | \n",
" Tan Ah Teck | \n",
"
\n",
" \n",
" 4 | \n",
" 0123456002 | \n",
" Tan Ah Teck | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" @ISBN author\n",
"0 0123456010 Bill Jones\n",
"1 0123456010 James Cook\n",
"2 0123456010 Mary Turing\n",
"3 0123456001 Tan Ah Teck\n",
"4 0123456002 Tan Ah Teck"
]
},
"execution_count": 10,
"metadata": {
},
"output_type": "execute_result"
}
],
"source": [
"data['author']"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"collapsed": false
},
"outputs": [
],
"source": [
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (Anaconda 2020)",
"env": {
"ADDR2LINE": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-addr2line",
"ANACONDA2019": "$EXT/anaconda-2019.03",
"ANACONDA2020": "/ext/anaconda2020.02",
"ANACONDA3": "$EXT/anaconda3",
"ANACONDA5": "$EXT/anaconda5",
"AR": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-ar",
"AS": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-as",
"CC": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-cc",
"CFLAGS": "-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /ext/anaconda2020.02/include",
"CMAKE_PREFIX_PATH": "/ext/anaconda2020.02:/ext/anaconda2020.02/x86_64-conda_cos6-linux-gnu/sysroot/usr",
"CONDA_BACKUP_HOST": "x86_64-conda_cos6-linux-gnu",
"CONDA_BUILD_SYSROOT": "/ext/anaconda2020.02/x86_64-conda_cos6-linux-gnu/sysroot",
"CONDA_DEFAULT_ENV": "base",
"CONDA_EXE": "/ext/anaconda2020.02/bin/conda",
"CONDA_MKL_INTERFACE_LAYER_BACKUP": "",
"CONDA_PREFIX": "/ext/anaconda2020.02",
"CONDA_PROMPT_MODIFIER": "(base) ",
"CONDA_PYTHON_EXE": "/ext/anaconda2020.02/bin/python",
"CONDA_SHLVL": "1",
"CPP": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-cpp",
"CPPFLAGS": "-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /ext/anaconda2020.02/include",
"CXX": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-c++",
"CXXFILT": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-c++filt",
"CXXFLAGS": "-fvisibility-inlines-hidden -std=c++17 -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /ext/anaconda2020.02/include",
"DEBUG_CFLAGS": "-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /ext/anaconda2020.02/include",
"DEBUG_CPPFLAGS": "-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /ext/anaconda2020.02/include",
"DEBUG_CXXFLAGS": "-fvisibility-inlines-hidden -std=c++17 -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /ext/anaconda2020.02/include",
"DEBUG_FFLAGS": "-fopenmp -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /ext/anaconda2020.02/include -fopenmp -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fcheck=all -fbacktrace -fimplicit-none -fvar-tracking-assignments -ffunction-sections -pipe",
"DEBUG_FORTRANFLAGS": "-fopenmp -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /ext/anaconda2020.02/include -fopenmp -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fcheck=all -fbacktrace -fimplicit-none -fvar-tracking-assignments -ffunction-sections -pipe",
"ELFEDIT": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-elfedit",
"F77": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gfortran",
"F90": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gfortran",
"F95": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-f95",
"FC": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gfortran",
"FFLAGS": "-fopenmp -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /ext/anaconda2020.02/include",
"FORTRANFLAGS": "-fopenmp -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /ext/anaconda2020.02/include",
"GCC": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gcc",
"GCC_AR": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gcc-ar",
"GCC_NM": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gcc-nm",
"GCC_RANLIB": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gcc-ranlib",
"GDAL_DATA": "/ext/anaconda2020.02/share/gdal",
"GFORTRAN": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gfortran",
"GPROF": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gprof",
"GXX": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-g++",
"JAVA_HOME": "/ext/anaconda2020.02",
"JAVA_HOME_CONDA_BACKUP": "",
"JAVA_LD_LIBRARY_PATH": "/ext/anaconda2020.02/lib/server",
"LD": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-ld",
"LDFLAGS": "-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,-rpath,/ext/anaconda2020.02/lib -Wl,-rpath-link,/ext/anaconda2020.02/lib -L/ext/anaconda2020.02/lib",
"LD_GOLD": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-ld.gold",
"LD_LIBRARY_PATH": "/ext/anaconda2020.02/lib",
"MKL_INTERFACE_LAYER": "LP64,GNU",
"MKL_THREADING_LAYER": "GNU",
"NM": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-nm",
"OBJCOPY": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-objcopy",
"OBJDUMP": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-objdump",
"OLDPWD": "/ext/anaconda2020.02",
"PATH": "/ext/anaconda2020.02/bin:/ext/anaconda2020.02/bin:/ext/anaconda2020.02/condabin:/cocalc/bin:/cocalc/src/smc-project/bin:/home/salvus/bin:/home/salvus/.local/bin:/usr/lib/xpra:/ext/bin:/opt/ghc/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin:/usr/lib/postgresql/10/bin:/ext/data/homer/bin:/ext/data/weblogo",
"PROJ_LIB": "/ext/anaconda2020.02/share/proj",
"RANLIB": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-ranlib",
"READELF": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-readelf",
"RSTUDIO_WHICH_R": "/ext/anaconda2020.02/bin/R",
"SIZE": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-size",
"STRINGS": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-strings",
"STRIP": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-strip",
"_CE_CONDA": "",
"_CE_M": "",
"_CONDA_PYTHON_SYSCONFIGDATA_NAME": "_sysconfigdata_x86_64_conda_cos6_linux_gnu"
},
"language": "python",
"metadata": {
"cocalc": {
"description": "Python/R distribution for data science",
"priority": 5,
"url": "https://www.anaconda.com/distribution/"
}
},
"name": "anaconda2020",
"resource_dir": "/ext/jupyter/kernels/anaconda2020"
}
},
"nbformat": 4,
"nbformat_minor": 4
}