CoCalc Shared Files · spacy-test.ipynb · Open in CoCalc with one click!
Author: Harald Schilly
Views : 49
Description: testing spacy

spaCy in CoCalc

Select the "Python 3 Ubuntu Linux" kernel

https://spacy.io/usage/linguistic-features

In [1]:
# Import spaCy for the rest of the notebook.
import spacy

# Bare trailing expression so the notebook displays the installed version.
spacy.__version__
'2.0.12'

The entries in spaCy's data directory are symbolic links to globally installed model data files:

In [2]:
# Long-list spaCy's data directory; the `?` wildcard stands in for the
# single-character Python 3 minor version in the path.
ls -l /usr/local/lib/python3.?/dist-packages/spacy/data
total 12 lrwxrwxrwx 1 root staff 31 Jul 27 14:50 de -> /ext/data/spacy/de_core_news_sm/ lrwxrwxrwx 1 root staff 30 Jul 27 14:47 en -> /ext/data/spacy/en_core_web_sm/ lrwxrwxrwx 1 root staff 30 Jul 27 14:57 xx -> /ext/data/spacy/xx_ent_wiki_sm/
In [3]:
# Show the contents of the directory that the `en` symlink listed above points to.
ls /ext/data/spacy/en_core_web_sm
__init__.py __pycache__/ en_core_web_sm-2.0.0/ meta.json
In [4]:
# Load the English pipeline through the `en` shortcut name (the `en` entry in
# spaCy's data directory is a symlink to en_core_web_sm).
# NOTE(review): shortcut names such as 'en' are reportedly unsupported in newer
# spaCy major versions; confirm before upgrading from 2.0.x.
nlp = spacy.load('en')
In [5]:
# Sample passage used for the entity-recognition and parse-tree cells.
# The line breaks are part of the input: the recorded outputs contain a '\n'
# token before the first sentence and after each sentence-final period, so the
# literal must start with a newline and keep one sentence per line.
text = u"""
The SDG Index and Dashboards Report provides a report card for country performance on the historic Agenda 2030 and the Sustainable Development Goals (SDGs).
The annual report shows how leaders can deliver on their promise and it urges countries not to lose the momentum for important reforms.
It is produced by the Sustainable Development Solutions Network (SDSN), the Deutsche Bank and the Bertelsmann Stiftung.
"""
In [6]:
# Run the loaded pipeline over the sample text; `doc` is the object whose
# entities and parse tree are inspected in the following cells.
doc = nlp(text)
In [7]:
# Print every entity span found in the document, one per line, as
# "<entity text> <entity label>".
for entity in doc.ents:
    print(
        entity.text,
        entity.label_,
    )
GPE Dashboards Report PERSON Agenda 2030 PRODUCT the Sustainable Development Goals (SDGs ORG GPE annual DATE GPE the Sustainable Development Solutions Network ORG SDSN ORG the Deutsche Bank ORG the Bertelsmann Stiftung ORG GPE
In [8]:
# Display the parse as a list of nested dicts (one root per sentence), with
# 'word', 'lemma', 'NE', 'POS_coarse', 'POS_fine', 'arc' and 'modifiers' keys.
# NOTE(review): Doc.print_tree appears to be deprecated in later spaCy releases
# in favour of Doc.to_json; confirm before running on a version newer than 2.0.x.
doc.print_tree()
[{'NE': '', 'POS_coarse': 'VERB', 'POS_fine': 'VBZ', 'arc': 'ROOT', 'lemma': 'provide', 'modifiers': [{'NE': 'PERSON', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'arc': 'nsubj', 'lemma': 'dashboards', 'modifiers': [{'NE': '', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'arc': 'det', 'lemma': 'the', 'modifiers': [{'NE': 'GPE', 'POS_coarse': '', 'POS_fine': '', 'arc': '', 'lemma': '\n', 'modifiers': [], 'word': '\n'}], 'word': 'The'}, {'NE': '', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'arc': 'nmod', 'lemma': 'index', 'modifiers': [{'NE': '', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'arc': 'compound', 'lemma': 'sdg', 'modifiers': [], 'word': 'SDG'}, {'NE': '', 'POS_coarse': 'CCONJ', 'POS_fine': 'CC', 'arc': 'cc', 'lemma': 'and', 'modifiers': [], 'word': 'and'}], 'word': 'Index'}], 'word': 'Dashboards Report'}, {'NE': '', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'arc': 'dobj', 'lemma': 'card', 'modifiers': [{'NE': '', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'arc': 'det', 'lemma': 'a', 'modifiers': [], 'word': 'a'}, {'NE': '', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'arc': 'compound', 'lemma': 'report', 'modifiers': [], 'word': 'report'}, {'NE': '', 'POS_coarse': 'ADP', 'POS_fine': 'IN', 'arc': 'prep', 'lemma': 'for', 'modifiers': [{'NE': '', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'arc': 'pobj', 'lemma': 'performance', 'modifiers': [{'NE': '', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'arc': 'compound', 'lemma': 'country', 'modifiers': [], 'word': 'country'}, {'NE': '', 'POS_coarse': 'ADP', 'POS_fine': 'IN', 'arc': 'prep', 'lemma': 'on', 'modifiers': [{'NE': 'PRODUCT', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'arc': 'pobj', 'lemma': 'agenda', 'modifiers': [{'NE': '', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'arc': 'det', 'lemma': 'the', 'modifiers': [], 'word': 'the'}, {'NE': '', 'POS_coarse': 'ADJ', 'POS_fine': 'JJ', 'arc': 'amod', 'lemma': 'historic', 'modifiers': [], 'word': 'historic'}, {'NE': '', 'POS_coarse': 'CCONJ', 'POS_fine': 'CC', 'arc': 'cc', 'lemma': 'and', 
'modifiers': [], 'word': 'and'}, {'NE': 'ORG', 'POS_coarse': 'PROPN', 'POS_fine': 'NNPS', 'arc': 'conj', 'lemma': 'the', 'modifiers': [{'NE': '', 'POS_coarse': 'PUNCT', 'POS_fine': '-RRB-', 'arc': 'punct', 'lemma': ')', 'modifiers': [], 'word': ')'}], 'word': 'the Sustainable Development Goals ( SDGs'}], 'word': 'Agenda 2030'}], 'word': 'on'}], 'word': 'performance'}], 'word': 'for'}], 'word': 'card'}, {'NE': '', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'arc': 'punct', 'lemma': '.', 'modifiers': [{'NE': 'GPE', 'POS_coarse': '', 'POS_fine': '', 'arc': '', 'lemma': '\n', 'modifiers': [], 'word': '\n'}], 'word': '.'}], 'word': 'provides'}, {'NE': '', 'POS_coarse': 'VERB', 'POS_fine': 'VBZ', 'arc': 'ROOT', 'lemma': 'show', 'modifiers': [{'NE': '', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'arc': 'nsubj', 'lemma': 'report', 'modifiers': [{'NE': '', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'arc': 'det', 'lemma': 'the', 'modifiers': [], 'word': 'The'}, {'NE': 'DATE', 'POS_coarse': 'ADJ', 'POS_fine': 'JJ', 'arc': 'amod', 'lemma': 'annual', 'modifiers': [], 'word': 'annual'}], 'word': 'report'}, {'NE': '', 'POS_coarse': 'VERB', 'POS_fine': 'VB', 'arc': 'ccomp', 'lemma': 'deliver', 'modifiers': [{'NE': '', 'POS_coarse': 'ADV', 'POS_fine': 'WRB', 'arc': 'advmod', 'lemma': 'how', 'modifiers': [], 'word': 'how'}, {'NE': '', 'POS_coarse': 'NOUN', 'POS_fine': 'NNS', 'arc': 'nsubj', 'lemma': 'leader', 'modifiers': [], 'word': 'leaders'}, {'NE': '', 'POS_coarse': 'VERB', 'POS_fine': 'MD', 'arc': 'aux', 'lemma': 'can', 'modifiers': [], 'word': 'can'}, {'NE': '', 'POS_coarse': 'ADP', 'POS_fine': 'IN', 'arc': 'prep', 'lemma': 'on', 'modifiers': [{'NE': '', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'arc': 'pobj', 'lemma': 'promise', 'modifiers': [{'NE': '', 'POS_coarse': 'ADJ', 'POS_fine': 'PRP$', 'arc': 'poss', 'lemma': '-PRON-', 'modifiers': [], 'word': 'their'}], 'word': 'promise'}], 'word': 'on'}, {'NE': '', 'POS_coarse': 'CCONJ', 'POS_fine': 'CC', 'arc': 'cc', 'lemma': 'and', 
'modifiers': [], 'word': 'and'}, {'NE': '', 'POS_coarse': 'VERB', 'POS_fine': 'VBZ', 'arc': 'conj', 'lemma': 'urge', 'modifiers': [{'NE': '', 'POS_coarse': 'PRON', 'POS_fine': 'PRP', 'arc': 'nsubj', 'lemma': '-PRON-', 'modifiers': [], 'word': 'it'}, {'NE': '', 'POS_coarse': 'NOUN', 'POS_fine': 'NNS', 'arc': 'dobj', 'lemma': 'country', 'modifiers': [], 'word': 'countries'}, {'NE': '', 'POS_coarse': 'VERB', 'POS_fine': 'VB', 'arc': 'xcomp', 'lemma': 'lose', 'modifiers': [{'NE': '', 'POS_coarse': 'ADV', 'POS_fine': 'RB', 'arc': 'neg', 'lemma': 'not', 'modifiers': [], 'word': 'not'}, {'NE': '', 'POS_coarse': 'PART', 'POS_fine': 'TO', 'arc': 'aux', 'lemma': 'to', 'modifiers': [], 'word': 'to'}, {'NE': '', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'arc': 'dobj', 'lemma': 'momentum', 'modifiers': [{'NE': '', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'arc': 'det', 'lemma': 'the', 'modifiers': [], 'word': 'the'}, {'NE': '', 'POS_coarse': 'ADP', 'POS_fine': 'IN', 'arc': 'prep', 'lemma': 'for', 'modifiers': [{'NE': '', 'POS_coarse': 'NOUN', 'POS_fine': 'NNS', 'arc': 'pobj', 'lemma': 'reform', 'modifiers': [{'NE': '', 'POS_coarse': 'ADJ', 'POS_fine': 'JJ', 'arc': 'amod', 'lemma': 'important', 'modifiers': [], 'word': 'important'}], 'word': 'reforms'}], 'word': 'for'}], 'word': 'momentum'}], 'word': 'lose'}], 'word': 'urges'}], 'word': 'deliver'}, {'NE': '', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'arc': 'punct', 'lemma': '.', 'modifiers': [{'NE': 'GPE', 'POS_coarse': '', 'POS_fine': '', 'arc': '', 'lemma': '\n', 'modifiers': [], 'word': '\n'}], 'word': '.'}], 'word': 'shows'}, {'NE': '', 'POS_coarse': 'VERB', 'POS_fine': 'VBN', 'arc': 'ROOT', 'lemma': 'produce', 'modifiers': [{'NE': '', 'POS_coarse': 'PRON', 'POS_fine': 'PRP', 'arc': 'nsubjpass', 'lemma': '-PRON-', 'modifiers': [], 'word': 'It'}, {'NE': '', 'POS_coarse': 'VERB', 'POS_fine': 'VBZ', 'arc': 'auxpass', 'lemma': 'be', 'modifiers': [], 'word': 'is'}, {'NE': '', 'POS_coarse': 'ADP', 'POS_fine': 'IN', 'arc': 'agent', 
'lemma': 'by', 'modifiers': [{'NE': 'ORG', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'arc': 'pobj', 'lemma': 'the', 'modifiers': [{'NE': '', 'POS_coarse': 'PUNCT', 'POS_fine': '-LRB-', 'arc': 'punct', 'lemma': '(', 'modifiers': [], 'word': '('}, {'NE': 'ORG', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'arc': 'appos', 'lemma': 'sdsn', 'modifiers': [], 'word': 'SDSN'}, {'NE': '', 'POS_coarse': 'PUNCT', 'POS_fine': '-RRB-', 'arc': 'punct', 'lemma': ')', 'modifiers': [], 'word': ')'}, {'NE': '', 'POS_coarse': 'PUNCT', 'POS_fine': ',', 'arc': 'punct', 'lemma': ',', 'modifiers': [], 'word': ','}, {'NE': 'ORG', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'arc': 'conj', 'lemma': 'the', 'modifiers': [{'NE': '', 'POS_coarse': 'CCONJ', 'POS_fine': 'CC', 'arc': 'cc', 'lemma': 'and', 'modifiers': [], 'word': 'and'}, {'NE': 'ORG', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'arc': 'conj', 'lemma': 'the', 'modifiers': [], 'word': 'the Bertelsmann Stiftung'}], 'word': 'the Deutsche Bank'}], 'word': 'the Sustainable Development Solutions Network'}], 'word': 'by'}, {'NE': '', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'arc': 'punct', 'lemma': '.', 'modifiers': [{'NE': 'GPE', 'POS_coarse': '', 'POS_fine': '', 'arc': '', 'lemma': '\n', 'modifiers': [], 'word': '\n'}], 'word': '.'}], 'word': 'produced'}]
In [ ]: