{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Defaulting to user installation because normal site-packages is not writeable\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: pandas_read_xml in /home/user/.local/lib/python3.7/site-packages (0.3.1)\r\n", "Requirement already satisfied: pyarrow in /home/user/.local/lib/python3.7/site-packages (from pandas_read_xml) (3.0.0)\r\n", "Requirement already satisfied: requests in /ext/anaconda2020.02/lib/python3.7/site-packages (from pandas_read_xml) (2.24.0)\r\n", "Requirement already satisfied: zipfile36 in /home/user/.local/lib/python3.7/site-packages (from pandas_read_xml) (0.1.3)\r\n", "Requirement already satisfied: distlib in /ext/anaconda2020.02/lib/python3.7/site-packages (from pandas_read_xml) (0.3.1)\r\n", "Requirement already satisfied: xmltodict in /ext/anaconda2020.02/lib/python3.7/site-packages (from pandas_read_xml) (0.12.0)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: pandas in /ext/anaconda2020.02/lib/python3.7/site-packages (from pandas_read_xml) (1.1.5)\r\n", "Requirement already satisfied: urllib3>=1.26.3 in /home/user/.local/lib/python3.7/site-packages (from pandas_read_xml) (1.26.4)\r\n", "Requirement already satisfied: numpy>=1.16.6 in /ext/anaconda2020.02/lib/python3.7/site-packages (from pyarrow->pandas_read_xml) (1.18.5)\r\n", "Requirement already satisfied: chardet<4,>=3.0.2 in /ext/anaconda2020.02/lib/python3.7/site-packages (from requests->pandas_read_xml) (3.0.4)\r\n", "Requirement already satisfied: idna<3,>=2.5 in /ext/anaconda2020.02/lib/python3.7/site-packages (from requests->pandas_read_xml) (2.8)\r\n", "Requirement already satisfied: certifi>=2017.4.17 in /ext/anaconda2020.02/lib/python3.7/site-packages (from requests->pandas_read_xml) (2020.12.5)\r\n", "Requirement already satisfied: python-dateutil>=2.7.3 in /ext/anaconda2020.02/lib/python3.7/site-packages (from pandas->pandas_read_xml) (2.8.0)\r\n", "Requirement already satisfied: pytz>=2017.2 in /ext/anaconda2020.02/lib/python3.7/site-packages (from pandas->pandas_read_xml) (2019.3)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: six>=1.5 in /ext/anaconda2020.02/lib/python3.7/site-packages (from python-dateutil>=2.7.3->pandas->pandas_read_xml) (1.14.0)\r\n" ] } ], "source": [ "!pip install pandas_read_xml" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "test_xml = \"\"\"\n", "\n", "\n", " \n", " Java For Dummies\n", " Tan Ah Teck\n", " Programming\n", " 2009\n", " 7\n", " 19.99\n", " \n", " \n", " More Java For Dummies\n", " Tan Ah Teck\n", " Programming\n", " 2008\n", " 25.99\n", " \n", " \n", " The Complete Guide to Fishing\n", " Bill Jones\n", " James Cook\n", " Mary Turing\n", " Fishing\n", " Leisure\n", " French\n", " 2000\n", " 2\n", " 49.99\n", " \n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/ext/anaconda2020.02/lib/python3.7/site-packages/requests/__init__.py:91: RequestsDependencyWarning: urllib3 (1.26.4) or chardet (3.0.4) doesn't match a supported version!\n", " RequestsDependencyWarning)\n" ] } ], "source": [ "import pandas_read_xml as pdxi\n", "from pandas_read_xml import flatten, fully_flatten, auto_separate_tables" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
book
0[{'@ISBN': '0123456001', 'title': 'Java For Du...
\n", "
" ], "text/plain": [ " book\n", "0 [{'@ISBN': '0123456001', 'title': 'Java For Du..." ] }, "execution_count": 4, "metadata": { }, "output_type": "execute_result" } ], "source": [ "df = pdxi.read_xml(test_xml, ['bookstore'])\n", " \n", "df" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
book
0{'@ISBN': '0123456001', 'title': 'Java For Dum...
1{'@ISBN': '0123456002', 'title': 'More Java Fo...
2{'@ISBN': '0123456010', 'title': 'The Complete...
\n", "
" ], "text/plain": [ " book\n", "0 {'@ISBN': '0123456001', 'title': 'Java For Dum...\n", "1 {'@ISBN': '0123456002', 'title': 'More Java Fo...\n", "2 {'@ISBN': '0123456010', 'title': 'The Complete..." ] }, "execution_count": 5, "metadata": { }, "output_type": "execute_result" } ], "source": [ "df = df.pipe(flatten)\n", " \n", "df" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
book|@ISBNbook|titlebook|authorbook|categorybook|yearbook|editionbook|pricebook|language
00123456001Java For DummiesTan Ah TeckProgramming2009719.99NaN
10123456002More Java For DummiesTan Ah TeckProgramming2008NaN25.99NaN
20123456010The Complete Guide to Fishing[Bill Jones, James Cook, Mary Turing][Fishing, Leisure]2000249.99French
\n", "
" ], "text/plain": [ " book|@ISBN book|title \\\n", "0 0123456001 Java For Dummies \n", "1 0123456002 More Java For Dummies \n", "2 0123456010 The Complete Guide to Fishing \n", "\n", " book|author book|category book|year \\\n", "0 Tan Ah Teck Programming 2009 \n", "1 Tan Ah Teck Programming 2008 \n", "2 [Bill Jones, James Cook, Mary Turing] [Fishing, Leisure] 2000 \n", "\n", " book|edition book|price book|language \n", "0 7 19.99 NaN \n", "1 NaN 25.99 NaN \n", "2 2 49.99 French " ] }, "execution_count": 6, "metadata": { }, "output_type": "execute_result" } ], "source": [ "df = df.pipe(flatten)\n", " \n", "df" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "key_columns = ['book|@ISBN']\n", " \n", "data = df.pipe(auto_separate_tables, key_columns)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "dict_keys(['author', 'category', 'book'])" ] }, "execution_count": 9, "metadata": { }, "output_type": "execute_result" } ], "source": [ "data.keys()\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
@ISBNauthor
00123456010Bill Jones
10123456010James Cook
20123456010Mary Turing
30123456001Tan Ah Teck
40123456002Tan Ah Teck
\n", "
" ], "text/plain": [ " @ISBN author\n", "0 0123456010 Bill Jones\n", "1 0123456010 James Cook\n", "2 0123456010 Mary Turing\n", "3 0123456001 Tan Ah Teck\n", "4 0123456002 Tan Ah Teck" ] }, "execution_count": 10, "metadata": { }, "output_type": "execute_result" } ], "source": [ "data['author']" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (Anaconda 2020)", "env": { "ADDR2LINE": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-addr2line", "ANACONDA2019": "$EXT/anaconda-2019.03", "ANACONDA2020": "/ext/anaconda2020.02", "ANACONDA3": "$EXT/anaconda3", "ANACONDA5": "$EXT/anaconda5", "AR": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-ar", "AS": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-as", "CC": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-cc", "CFLAGS": "-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /ext/anaconda2020.02/include", "CMAKE_PREFIX_PATH": "/ext/anaconda2020.02:/ext/anaconda2020.02/x86_64-conda_cos6-linux-gnu/sysroot/usr", "CONDA_BACKUP_HOST": "x86_64-conda_cos6-linux-gnu", "CONDA_BUILD_SYSROOT": "/ext/anaconda2020.02/x86_64-conda_cos6-linux-gnu/sysroot", "CONDA_DEFAULT_ENV": "base", "CONDA_EXE": "/ext/anaconda2020.02/bin/conda", "CONDA_MKL_INTERFACE_LAYER_BACKUP": "", "CONDA_PREFIX": "/ext/anaconda2020.02", "CONDA_PROMPT_MODIFIER": "(base) ", "CONDA_PYTHON_EXE": "/ext/anaconda2020.02/bin/python", "CONDA_SHLVL": "1", "CPP": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-cpp", "CPPFLAGS": "-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /ext/anaconda2020.02/include", "CXX": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-c++", "CXXFILT": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-c++filt", "CXXFLAGS": "-fvisibility-inlines-hidden -std=c++17 -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /ext/anaconda2020.02/include", "DEBUG_CFLAGS": "-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /ext/anaconda2020.02/include", "DEBUG_CPPFLAGS": "-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /ext/anaconda2020.02/include", "DEBUG_CXXFLAGS": "-fvisibility-inlines-hidden -std=c++17 -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /ext/anaconda2020.02/include", "DEBUG_FFLAGS": "-fopenmp -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /ext/anaconda2020.02/include -fopenmp -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fcheck=all -fbacktrace -fimplicit-none -fvar-tracking-assignments -ffunction-sections -pipe", "DEBUG_FORTRANFLAGS": "-fopenmp -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /ext/anaconda2020.02/include -fopenmp -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fcheck=all -fbacktrace -fimplicit-none -fvar-tracking-assignments -ffunction-sections -pipe", "ELFEDIT": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-elfedit", "F77": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gfortran", "F90": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gfortran", "F95": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-f95", "FC": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gfortran", "FFLAGS": "-fopenmp -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /ext/anaconda2020.02/include", "FORTRANFLAGS": "-fopenmp -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /ext/anaconda2020.02/include", "GCC": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gcc", "GCC_AR": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gcc-ar", "GCC_NM": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gcc-nm", "GCC_RANLIB": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gcc-ranlib", "GDAL_DATA": "/ext/anaconda2020.02/share/gdal", "GFORTRAN": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gfortran", "GPROF": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-gprof", "GXX": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-g++", "JAVA_HOME": "/ext/anaconda2020.02", "JAVA_HOME_CONDA_BACKUP": "", "JAVA_LD_LIBRARY_PATH": "/ext/anaconda2020.02/lib/server", "LD": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-ld", "LDFLAGS": "-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,-rpath,/ext/anaconda2020.02/lib -Wl,-rpath-link,/ext/anaconda2020.02/lib -L/ext/anaconda2020.02/lib", "LD_GOLD": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-ld.gold", "LD_LIBRARY_PATH": "/ext/anaconda2020.02/lib", "MKL_INTERFACE_LAYER": "LP64,GNU", "MKL_THREADING_LAYER": "GNU", "NM": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-nm", "OBJCOPY": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-objcopy", "OBJDUMP": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-objdump", "OLDPWD": "/ext/anaconda2020.02", "PATH": "/ext/anaconda2020.02/bin:/ext/anaconda2020.02/bin:/ext/anaconda2020.02/condabin:/cocalc/bin:/cocalc/src/smc-project/bin:/home/salvus/bin:/home/salvus/.local/bin:/usr/lib/xpra:/ext/bin:/opt/ghc/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/snap/bin:/usr/lib/postgresql/10/bin:/ext/data/homer/bin:/ext/data/weblogo", "PROJ_LIB": "/ext/anaconda2020.02/share/proj", "RANLIB": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-ranlib", "READELF": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-readelf", "RSTUDIO_WHICH_R": "/ext/anaconda2020.02/bin/R", "SIZE": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-size", "STRINGS": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-strings", "STRIP": "/ext/anaconda2020.02/bin/x86_64-conda_cos6-linux-gnu-strip", "_CE_CONDA": "", "_CE_M": "", "_CONDA_PYTHON_SYSCONFIGDATA_NAME": "_sysconfigdata_x86_64_conda_cos6_linux_gnu" }, "language": "python", "metadata": { "cocalc": { "description": "Python/R distribution for data science", "priority": 5, "url": "https://www.anaconda.com/distribution/" } }, "name": "anaconda2020", "resource_dir": "/ext/jupyter/kernels/anaconda2020" } }, "nbformat": 4, "nbformat_minor": 4 }