{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "title: \"Grewpy • request\"\n", "date: 2024-04-22\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[`grewpy` Tutorial](../tutorial)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Grewpy tutorial: Modify data\n", "\n", "Download the notebook [here](../modify_data.ipynb)." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2024-08-29T14:18:26.696506Z", "iopub.status.busy": "2024-08-29T14:18:26.696095Z", "iopub.status.idle": "2024-08-29T14:18:27.272316Z", "shell.execute_reply": "2024-08-29T14:18:27.271900Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "connected to port: 64114\n" ] } ], "source": [ "import grewpy\n", "from grewpy import Corpus, CorpusDraft, Request\n", "grewpy.set_config(\"sud\") # ud or basic\n", "corpus = Corpus(\"SUD_English-PUD\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Access data in a corpus" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2024-08-29T14:18:27.274489Z", "iopub.status.busy": "2024-08-29T14:18:27.274323Z", "iopub.status.idle": "2024-08-29T14:18:27.277839Z", "shell.execute_reply": "2024-08-29T14:18:27.277549Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A corpus is a set of graphs: \n" ] } ], "source": [ "# Access to the corpus\n", "sentence = corpus[1]\n", "print(\"A corpus is a set of graphs:\", type(sentence))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2024-08-29T14:18:27.279390Z", "iopub.status.busy": "2024-08-29T14:18:27.279307Z", "iopub.status.idle": "2024-08-29T14:18:27.283772Z", "shell.execute_reply": "2024-08-29T14:18:27.283497Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sentence metadata:\n" ] }, { "data": { "text/plain": [ 
"{'sent_id': 'n01001013',\n", " 'text': 'For those who follow social media transitions on Capitol Hill, this will be a little different.',\n", " '_filename': 'en_pud-sud-test.conllu'}" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Each graph is a sentence and contains all its information\n", "print(\"Sentence metadata:\")\n", "sentence.meta" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2024-08-29T14:18:27.285619Z", "iopub.status.busy": "2024-08-29T14:18:27.285516Z", "iopub.status.idle": "2024-08-29T14:18:27.287216Z", "shell.execute_reply": "2024-08-29T14:18:27.286980Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18']\n" ] } ], "source": [ "# Sentence order, which in this case is the same as the token's id\n", "print(sentence.order)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2024-08-29T14:18:27.288730Z", "iopub.status.busy": "2024-08-29T14:18:27.288623Z", "iopub.status.idle": "2024-08-29T14:18:27.290596Z", "shell.execute_reply": "2024-08-29T14:18:27.290341Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'0': {'form': '__0__'}, '1': {'form': 'For', 'lemma': 'for', 'textform': 'For', 'upos': 'ADP', 'wordform': 'For', 'xpos': 'IN'}, '2': {'Number': 'Plur', 'PronType': 'Dem', 'form': 'those', 'lemma': 'those', 'textform': 'those', 'upos': 'PRON', 'wordform': 'those', 'xpos': 'DT'}, '3': {'PronType': 'Rel', 'form': 'who', 'lemma': 'who', 'textform': 'who', 'upos': 'PRON', 'wordform': 'who', 'xpos': 'WP'}, '4': {'Mood': 'Ind', 'Tense': 'Pres', 'VerbForm': 'Fin', 'form': 'follow', 'lemma': 'follow', 'textform': 'follow', 'upos': 'VERB', 'wordform': 'follow', 'xpos': 'VBP'}, '5': {'Degree': 'Pos', 'form': 'social', 'lemma': 'social', 'textform': 
'social', 'upos': 'ADJ', 'wordform': 'social', 'xpos': 'JJ'}, '6': {'Number': 'Sing', 'form': 'media', 'lemma': 'media', 'textform': 'media', 'upos': 'NOUN', 'wordform': 'media', 'xpos': 'NN'}, '7': {'Number': 'Plur', 'form': 'transitions', 'lemma': 'transition', 'textform': 'transitions', 'upos': 'NOUN', 'wordform': 'transitions', 'xpos': 'NNS'}, '8': {'form': 'on', 'lemma': 'on', 'textform': 'on', 'upos': 'ADP', 'wordform': 'on', 'xpos': 'IN'}, '9': {'Number': 'Sing', 'form': 'Capitol', 'lemma': 'Capitol', 'textform': 'Capitol', 'upos': 'PROPN', 'wordform': 'Capitol', 'xpos': 'NNP'}, '10': {'Number': 'Sing', 'SpaceAfter': 'No', 'form': 'Hill', 'lemma': 'Hill', 'textform': 'Hill', 'upos': 'PROPN', 'wordform': 'Hill', 'xpos': 'NNP'}, '11': {'form': ',', 'lemma': ',', 'textform': ',', 'upos': 'PUNCT', 'wordform': ',', 'xpos': ','}, '12': {'Number': 'Sing', 'PronType': 'Dem', 'form': 'this', 'lemma': 'this', 'textform': 'this', 'upos': 'PRON', 'wordform': 'this', 'xpos': 'DT'}, '13': {'VerbForm': 'Fin', 'form': 'will', 'lemma': 'will', 'textform': 'will', 'upos': 'AUX', 'wordform': 'will', 'xpos': 'MD'}, '14': {'VerbForm': 'Inf', 'form': 'be', 'lemma': 'be', 'textform': 'be', 'upos': 'AUX', 'wordform': 'be', 'xpos': 'VB'}, '15': {'Definite': 'Ind', 'PronType': 'Art', 'form': 'a', 'lemma': 'a', 'textform': 'a', 'upos': 'DET', 'wordform': 'a', 'xpos': 'DT'}, '16': {'Degree': 'Pos', 'form': 'little', 'lemma': 'little', 'textform': 'little', 'upos': 'ADJ', 'wordform': 'little', 'xpos': 'JJ'}, '17': {'Degree': 'Pos', 'SpaceAfter': 'No', 'form': 'different', 'lemma': 'different', 'textform': 'different', 'upos': 'ADJ', 'wordform': 'different', 'xpos': 'JJ'}, '18': {'form': '.', 'lemma': '.', 'textform': '.', 'upos': 'PUNCT', 'wordform': '.', 'xpos': '.'}}\n", "['ADP', 'PRON', 'PRON', 'VERB', 'ADJ', 'NOUN', 'NOUN', 'ADP', 'PROPN', 'PROPN', 'PUNCT', 'PRON', 'AUX', 'AUX', 'DET', 'ADJ', 'ADJ', 'PUNCT']\n" ] } ], "source": [ "# Token features, which make possible to access 
every token feature\n", "print(sentence.features)\n", "\n", "# e.g get all upos of the sentence\n", "print([sentence.features[id]['upos'] for id in sentence.features if id != \"0\"])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2024-08-29T14:18:27.292136Z", "iopub.status.busy": "2024-08-29T14:18:27.292034Z", "iopub.status.idle": "2024-08-29T14:18:27.293707Z", "shell.execute_reply": "2024-08-29T14:18:27.293449Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'17': [('16', Fs_edge({'1': 'udep', 'deep': 'npmod'}))], '16': [('15', Fs_edge({'1': 'det'}))], '14': [('17', Fs_edge({'1': 'comp', '2': 'pred'}))], '13': [('18', Fs_edge({'1': 'punct'})), ('14', Fs_edge({'1': 'comp', '2': 'aux'})), ('12', Fs_edge({'1': 'subj'})), ('11', Fs_edge({'1': 'punct'})), ('1', Fs_edge({'1': 'udep'}))], '10': [('9', Fs_edge({'1': 'compound'}))], '8': [('10', Fs_edge({'1': 'comp', '2': 'obj'}))], '7': [('8', Fs_edge({'1': 'udep'})), ('6', Fs_edge({'1': 'compound'}))], '6': [('5', Fs_edge({'1': 'mod'}))], '4': [('7', Fs_edge({'1': 'comp', '2': 'obj'})), ('3', Fs_edge({'1': 'subj'}))], '2': [('4', Fs_edge({'1': 'mod', 'deep': 'relcl'}))], '1': [('2', Fs_edge({'1': 'comp', '2': 'obj'}))], '0': [('13', Fs_edge({'1': 'root'}))]}\n" ] } ], "source": [ "# It's possible to access to edges between nodes as successors\n", "print(sentence.sucs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Modifying a corpus\n", "`Corpus` is an abstract object which cannot be modified directly:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2024-08-29T14:18:27.295314Z", "iopub.status.busy": "2024-08-29T14:18:27.295216Z", "iopub.status.idle": "2024-08-29T14:18:27.297978Z", "shell.execute_reply": "2024-08-29T14:18:27.297725Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "'Corpus' object does not support item assignment\n" ] } ], 
"source": [ "try:\n", "\tcorpus[0] = corpus[1]\n", "except TypeError as error_message:\n", "\tprint (f\"{error_message}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`CorpusDraft` is an object similar to `Corpus` but which is mutable.\n", "Below, we add the feature `Transitive=Yes` to all occurrences of verbs with a direct object.\n", "\n", "1. We make the search on `corpus` (an instance of `Corpus`).\n", "2. The modification is done on a `CorpusDraft` counterpart named `draft`.\n", "3. The `draft` should be transformed again into a `Corpus` (names `corpus2` below) in order to use the `count` method." ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2024-08-29T14:18:27.299551Z", "iopub.status.busy": "2024-08-29T14:18:27.299448Z", "iopub.status.idle": "2024-08-29T14:18:27.747712Z", "shell.execute_reply": "2024-08-29T14:18:27.747423Z" } }, "outputs": [ { "data": { "text/plain": [ "853" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# step 1\n", "req7 = Request().pattern(\"X[upos=VERB]; Y[upos=NOUN|PROPN|PRON]; X-[comp:obj]->Y\")\n", "occurrences = corpus.search(req7)\n", "\n", "# step 2\n", "draft = CorpusDraft(corpus)\n", "for occ in occurrences:\n", " sent_id = occ['sent_id']\n", " verb_node_id = occ['matching']['nodes']['X']\n", " draft[sent_id][verb_node_id].update({\"Transitive\": \"Yes\"})\n", "\n", "# step 3\n", "corpus2 = Corpus(draft)\n", "corpus2.count(Request(\"pattern { X[Transitive=Yes] }\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It's possible to modify a whole `CorpusDraft` with a function getting a graph as input." 
] }, { "cell_type": "code", "execution_count": 9, "metadata": { "execution": { "iopub.execute_input": "2024-08-29T14:18:27.749585Z", "iopub.status.busy": "2024-08-29T14:18:27.749465Z", "iopub.status.idle": "2024-08-29T14:18:28.040693Z", "shell.execute_reply": "2024-08-29T14:18:28.040432Z" } }, "outputs": [ { "data": { "text/plain": [ "4036" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def relabel_noun(graph):\n", "    for node in graph:\n", "        if 'upos' in graph[node] and graph[node]['upos'] == 'NOUN':\n", "            graph[node]['upos'] = 'N'\n", "    return graph\n", "\n", "draft3 = draft.apply(relabel_noun)\n", "\n", "# Again, we need to turn the result into a `Corpus` before using the `count` method.\n", "corpus3 = Corpus(draft3)\n", "corpus3.count(Request(\"pattern { X[upos=N] }\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Modifying a corpus using a GRS (Graph Rewriting System)\n", "In many cases, it is not required to use a `CorpusDraft` and the modification of a corpus can be encoded with graph rewriting rules.\n", "\n", "The example above (identifying transitive verbs) can be rephrased as below.\n", "See TODO link for an explanation of the `without` clause in this example." 
] }, { "cell_type": "code", "execution_count": 10, "metadata": { "execution": { "iopub.execute_input": "2024-08-29T14:18:28.042434Z", "iopub.status.busy": "2024-08-29T14:18:28.042332Z", "iopub.status.idle": "2024-08-29T14:18:28.070006Z", "shell.execute_reply": "2024-08-29T14:18:28.069755Z" } }, "outputs": [ { "data": { "text/plain": [ "853" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from grewpy import GRS\n", "\n", "s = \"\"\"\n", "strat main { Onf(tv) }\n", "\n", "rule tv {\n", " pattern { X[upos=VERB]; Y[upos=NOUN|PROPN|PRON]; X-[comp:obj]->Y }\n", " without { X[Transitive = Yes] }\n", " commands { X.Transitive = Yes }\n", "}\n", "\"\"\"\n", "grs = GRS(s)\n", "corpus2bis = grs.apply(corpus)\n", "corpus2bis.count(Request(\"pattern { X[Transitive=Yes] }\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The other example, where the upos tag `NOUN` is changed to `N`, can also be done with a GRS:" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "execution": { "iopub.execute_input": "2024-08-29T14:18:28.071870Z", "iopub.status.busy": "2024-08-29T14:18:28.071764Z", "iopub.status.idle": "2024-08-29T14:18:28.121899Z", "shell.execute_reply": "2024-08-29T14:18:28.121600Z" } }, "outputs": [ { "data": { "text/plain": [ "4036" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from grewpy import GRS\n", "\n", "grs3 = GRS(\"\"\"\n", "strat main { Onf(noun2n) }\n", "\n", "rule noun2n {\n", " pattern { X[upos=NOUN] }\n", " commands { X.upos = N }\n", "}\n", "\"\"\")\n", "corpus3bis = grs3.apply(corpus)\n", "corpus3bis.count(Request(\"pattern { X[upos=N] }\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Similarly to the `CorpusDraft` above, there is a `GRSDraft` class which can be inspected and which is mutable." 
] }, { "cell_type": "code", "execution_count": 12, "metadata": { "execution": { "iopub.execute_input": "2024-08-29T14:18:28.123616Z", "iopub.status.busy": "2024-08-29T14:18:28.123513Z", "iopub.status.idle": "2024-08-29T14:18:28.126575Z", "shell.execute_reply": "2024-08-29T14:18:28.126310Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "rule='existential'\n" ] } ], "source": [ "from grewpy import GRSDraft\n", "\n", "s = \"\"\"\n", "strat main {Onf(cxns)}\n", "package cxns {\n", " rule existential {\n", " pattern {X-[comp@expl]->Y; X[lemma=be]}\n", " without {X[Cxn=Existential]}\n", " commands {X.Cxn=Existential}\n", " }\n", "}\n", "\"\"\"\n", "\n", "grs_draft = GRSDraft(s)\n", "\n", "for rule in grs_draft['cxns'].rules():\n", " print(f\"{rule=}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A `GRSDraft` cannot be applied to a corpus directly; it must first be turned into a `GRS`:" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "execution": { "iopub.execute_input": "2024-08-29T14:18:28.128207Z", "iopub.status.busy": "2024-08-29T14:18:28.128123Z", "iopub.status.idle": "2024-08-29T14:18:28.145557Z", "shell.execute_reply": "2024-08-29T14:18:28.145276Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "n_existentials=26\n" ] } ], "source": [ "grs = GRS(grs_draft)\n", "corpus.apply(grs)\n", "n_existentials = corpus.count(Request(\"pattern { X[Cxn=Existential] }\"))\n", "print(f\"{n_existentials=}\")" ] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.5" } }, "nbformat": 4, "nbformat_minor": 2 }