{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```\n",
    "title: \"Grewpy • request\"\n",
    "date: 2024-04-22\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[`grewpy` Tutorial](../tutorial)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Grewpy tutorial: Modify data\n",
    "\n",
    "Download the notebook [here](../modify_data.ipynb)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-07-03T15:07:45.571072Z",
     "iopub.status.busy": "2025-07-03T15:07:45.570832Z",
     "iopub.status.idle": "2025-07-03T15:07:46.559719Z",
     "shell.execute_reply": "2025-07-03T15:07:46.558738Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "connected to port: 50163\n"
     ]
    }
   ],
   "source": [
    "import grewpy\n",
    "from grewpy import Corpus, CorpusDraft, Request\n",
    "grewpy.set_config(\"sud\") # ud or basic\n",
    "corpus = Corpus(\"SUD_English-PUD\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Access data in a corpus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-07-03T15:07:46.565222Z",
     "iopub.status.busy": "2025-07-03T15:07:46.564842Z",
     "iopub.status.idle": "2025-07-03T15:07:46.570513Z",
     "shell.execute_reply": "2025-07-03T15:07:46.569784Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A corpus is a set of graphs: <class 'grewpy.graph.Graph'>\n"
     ]
    }
   ],
   "source": [
    "# Access to the corpus\n",
    "sentence = corpus[1]\n",
    "print(\"A corpus is a set of graphs:\", type(sentence))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-07-03T15:07:46.573343Z",
     "iopub.status.busy": "2025-07-03T15:07:46.572824Z",
     "iopub.status.idle": "2025-07-03T15:07:46.580875Z",
     "shell.execute_reply": "2025-07-03T15:07:46.580176Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sentence metadata:\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'sent_id': 'n01001013',\n",
       " 'text': 'For those who follow social media transitions on Capitol Hill, this will be a little different.',\n",
       " '_filename': 'en_pud-sud-test.conllu'}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Each graph is a sentence and contains all its information\n",
    "print(\"Sentence metadata:\")\n",
    "sentence.meta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-07-03T15:07:46.584108Z",
     "iopub.status.busy": "2025-07-03T15:07:46.583861Z",
     "iopub.status.idle": "2025-07-03T15:07:46.588153Z",
     "shell.execute_reply": "2025-07-03T15:07:46.587413Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18']\n"
     ]
    }
   ],
   "source": [
    "# Sentence order, which in this case is the same as the token's id\n",
    "print(sentence.order)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-07-03T15:07:46.590971Z",
     "iopub.status.busy": "2025-07-03T15:07:46.590721Z",
     "iopub.status.idle": "2025-07-03T15:07:46.596834Z",
     "shell.execute_reply": "2025-07-03T15:07:46.595897Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'0': {'form': '__0__'}, '1': {'form': 'For', 'lemma': 'for', 'textform': 'For', 'upos': 'ADP', 'wordform': 'For', 'xpos': 'IN'}, '2': {'Number': 'Plur', 'PronType': 'Dem', 'form': 'those', 'lemma': 'that', 'textform': 'those', 'upos': 'PRON', 'wordform': 'those', 'xpos': 'DT'}, '3': {'PronType': 'Rel', 'form': 'who', 'lemma': 'who', 'textform': 'who', 'upos': 'PRON', 'wordform': 'who', 'xpos': 'WP'}, '4': {'Mood': 'Ind', 'Tense': 'Pres', 'VerbForm': 'Fin', 'form': 'follow', 'lemma': 'follow', 'textform': 'follow', 'upos': 'VERB', 'wordform': 'follow', 'xpos': 'VBP'}, '5': {'Degree': 'Pos', 'form': 'social', 'lemma': 'social', 'textform': 'social', 'upos': 'ADJ', 'wordform': 'social', 'xpos': 'JJ'}, '6': {'Number': 'Sing', 'form': 'media', 'lemma': 'media', 'textform': 'media', 'upos': 'NOUN', 'wordform': 'media', 'xpos': 'NN'}, '7': {'Number': 'Plur', 'form': 'transitions', 'lemma': 'transition', 'textform': 'transitions', 'upos': 'NOUN', 'wordform': 'transitions', 'xpos': 'NNS'}, '8': {'form': 'on', 'lemma': 'on', 'textform': 'on', 'upos': 'ADP', 'wordform': 'on', 'xpos': 'IN'}, '9': {'Number': 'Sing', 'form': 'Capitol', 'lemma': 'Capitol', 'textform': 'Capitol', 'upos': 'PROPN', 'wordform': 'Capitol', 'xpos': 'NNP'}, '10': {'Number': 'Sing', 'SpaceAfter': 'No', 'form': 'Hill', 'lemma': 'Hill', 'textform': 'Hill', 'upos': 'PROPN', 'wordform': 'Hill', 'xpos': 'NNP'}, '11': {'form': ',', 'lemma': ',', 'textform': ',', 'upos': 'PUNCT', 'wordform': ',', 'xpos': ','}, '12': {'Number': 'Sing', 'PronType': 'Dem', 'form': 'this', 'lemma': 'this', 'textform': 'this', 'upos': 'PRON', 'wordform': 'this', 'xpos': 'DT'}, '13': {'VerbForm': 'Fin', 'form': 'will', 'lemma': 'will', 'textform': 'will', 'upos': 'AUX', 'wordform': 'will', 'xpos': 'MD'}, '14': {'VerbForm': 'Inf', 'form': 'be', 'lemma': 'be', 'textform': 'be', 'upos': 'AUX', 'wordform': 'be', 'xpos': 'VB'}, '15': {'Definite': 'Ind', 'PronType': 'Art', 'form': 'a', 'lemma': 'a', 'textform': 'a', 'upos': 'DET', 'wordform': 'a', 'xpos': 'DT'}, '16': {'Degree': 'Pos', 'form': 'little', 'lemma': 'little', 'textform': 'little', 'upos': 'ADJ', 'wordform': 'little', 'xpos': 'JJ'}, '17': {'Degree': 'Pos', 'SpaceAfter': 'No', 'form': 'different', 'lemma': 'different', 'textform': 'different', 'upos': 'ADJ', 'wordform': 'different', 'xpos': 'JJ'}, '18': {'form': '.', 'lemma': '.', 'textform': '.', 'upos': 'PUNCT', 'wordform': '.', 'xpos': '.'}}\n",
      "['ADP', 'PRON', 'PRON', 'VERB', 'ADJ', 'NOUN', 'NOUN', 'ADP', 'PROPN', 'PROPN', 'PUNCT', 'PRON', 'AUX', 'AUX', 'DET', 'ADJ', 'ADJ', 'PUNCT']\n"
     ]
    }
   ],
   "source": [
    "# Token features, which make possible to access every token feature\n",
    "print(sentence.features)\n",
    "\n",
    "# e.g get all upos of the sentence\n",
    "print([sentence.features[id]['upos'] for id in sentence.features if id != \"0\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-07-03T15:07:46.599481Z",
     "iopub.status.busy": "2025-07-03T15:07:46.599248Z",
     "iopub.status.idle": "2025-07-03T15:07:46.604421Z",
     "shell.execute_reply": "2025-07-03T15:07:46.603622Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'17': [('16', FsEdge({'1': 'udep', 'deep': 'unmarked'}))], '16': [('15', FsEdge({'1': 'det'}))], '14': [('17', FsEdge({'1': 'comp', '2': 'pred'}))], '13': [('18', FsEdge({'1': 'punct'})), ('14', FsEdge({'1': 'comp', '2': 'aux'})), ('12', FsEdge({'1': 'subj'})), ('11', FsEdge({'1': 'punct'})), ('1', FsEdge({'1': 'udep'}))], '10': [('9', FsEdge({'1': 'compound'}))], '8': [('10', FsEdge({'1': 'comp', '2': 'obj'}))], '7': [('8', FsEdge({'1': 'udep'})), ('6', FsEdge({'1': 'compound'}))], '6': [('5', FsEdge({'1': 'mod'}))], '4': [('7', FsEdge({'1': 'comp', '2': 'obj'})), ('3', FsEdge({'1': 'subj'}))], '2': [('4', FsEdge({'1': 'mod', 'deep': 'relcl'}))], '1': [('2', FsEdge({'1': 'comp', '2': 'obj'}))], '0': [('13', FsEdge({'1': 'root'}))]}\n"
     ]
    }
   ],
   "source": [
    "# It's possible to access to edges between nodes as successors\n",
    "print(sentence.sucs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Modifying a corpus\n",
    "`Corpus` is an abstract object which cannot be modified directly:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-07-03T15:07:46.607172Z",
     "iopub.status.busy": "2025-07-03T15:07:46.606920Z",
     "iopub.status.idle": "2025-07-03T15:07:46.613710Z",
     "shell.execute_reply": "2025-07-03T15:07:46.613022Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'Corpus' object does not support item assignment\n"
     ]
    }
   ],
   "source": [
    "try:\n",
    "\tcorpus[0] = corpus[1]\n",
    "except TypeError as error_message:\n",
    "\tprint (f\"{error_message}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`CorpusDraft` is an object similar to `Corpus` but which is mutable.\n",
    "Below, we add the feature `Transitive=Yes` to all occurrences of verbs with a direct object.\n",
    "\n",
    "1. We make the search on `corpus` (an instance of `Corpus`).\n",
    "2. The modification is done on a `CorpusDraft` counterpart named `draft`.\n",
    "3. The `draft` should be transformed again into a `Corpus` (names `corpus2` below) in order to use the `count` method."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-07-03T15:07:46.616432Z",
     "iopub.status.busy": "2025-07-03T15:07:46.616180Z",
     "iopub.status.idle": "2025-07-03T15:07:47.700202Z",
     "shell.execute_reply": "2025-07-03T15:07:47.698602Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "853"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# step 1\n",
    "req7 = Request().pattern(\"X[upos=VERB]; Y[upos=NOUN|PROPN|PRON]; X-[comp:obj]->Y\")\n",
    "occurrences = corpus.search(req7)\n",
    "\n",
    "# step 2\n",
    "draft = CorpusDraft(corpus)\n",
    "for occ in occurrences:\n",
    "    sent_id = occ['sent_id']\n",
    "    verb_node_id = occ['matching']['nodes']['X']\n",
    "    draft[sent_id][verb_node_id].update({\"Transitive\": \"Yes\"})\n",
    "\n",
    "# step 3\n",
    "corpus2 = Corpus(draft)\n",
    "corpus2.count(Request(\"pattern { X[Transitive=Yes] }\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It's possible to modify a whole `CorpusDraft` with a function getting a graph as input."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-07-03T15:07:47.703620Z",
     "iopub.status.busy": "2025-07-03T15:07:47.703016Z",
     "iopub.status.idle": "2025-07-03T15:07:48.409788Z",
     "shell.execute_reply": "2025-07-03T15:07:48.409096Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4019"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def relabel_noun(graph):\n",
    "    for node in graph:\n",
    "        if 'upos' in graph[node] and graph[node]['upos'] == 'NOUN':\n",
    "            graph[node]['upos'] = 'N'\n",
    "    return graph\n",
    "\n",
    "draft3 = draft.map(relabel_noun)\n",
    "# Note that the map function has replaced the apply function which is deprecated in 0.6\n",
    "\n",
    "\n",
    "# Again, we need to turn the result into a `Corpus` before using the `count` method.\n",
    "corpus3 = Corpus(draft3)\n",
    "corpus3.count(Request(\"pattern { X[upos=N] }\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Modifying a corpus using a GRS (Graph Rewriting System)\n",
    "In many cases, it is not required to uses a `CorpusDraft` and the modification of a corpus can be encoded with graph rewriting rules.\n",
    "\n",
    "The example above (identifying transitive verbs) can be rephrased as below.\n",
    "See TODO link for an explanation of the `without` clause in this example."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-07-03T15:07:48.412418Z",
     "iopub.status.busy": "2025-07-03T15:07:48.412151Z",
     "iopub.status.idle": "2025-07-03T15:07:48.473112Z",
     "shell.execute_reply": "2025-07-03T15:07:48.471850Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "853"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from grewpy import GRS\n",
    "\n",
    "s = \"\"\"\n",
    "strat main { Onf(tv) }\n",
    "\n",
    "rule tv {\n",
    "  pattern { X[upos=VERB]; Y[upos=NOUN|PROPN|PRON]; X-[comp:obj]->Y }\n",
    "  without { X[Transitive = Yes] }\n",
    "  commands { X.Transitive = Yes }\n",
    "}\n",
    "\"\"\"\n",
    "grs = GRS(s)\n",
    "corpus2bis = grs.apply(corpus)\n",
    "corpus2bis.count(Request(\"pattern { X[Transitive=Yes] }\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For the example, where the upos tag `NOUN` is changed to `N`, this can be done with a GRS:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-07-03T15:07:48.477071Z",
     "iopub.status.busy": "2025-07-03T15:07:48.476406Z",
     "iopub.status.idle": "2025-07-03T15:07:48.583545Z",
     "shell.execute_reply": "2025-07-03T15:07:48.581850Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4019"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from grewpy import GRS\n",
    "\n",
    "grs3 = GRS(\"\"\"\n",
    "strat main { Onf(noun2n) }\n",
    "\n",
    "rule noun2n {\n",
    "  pattern { X[upos=NOUN] }\n",
    "  commands { X.upos = N }\n",
    "}\n",
    "\"\"\")\n",
    "corpus3bis = grs3.apply(corpus)\n",
    "corpus3bis.count(Request(\"pattern { X[upos=N] }\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Similarily to the `CorpusDraft` above, there is a module `GRSDraft` which can be inspected and which is mutable."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-07-03T15:07:48.588563Z",
     "iopub.status.busy": "2025-07-03T15:07:48.587955Z",
     "iopub.status.idle": "2025-07-03T15:07:48.597319Z",
     "shell.execute_reply": "2025-07-03T15:07:48.596319Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "rule='existential'\n"
     ]
    }
   ],
   "source": [
    "from grewpy import GRSDraft\n",
    "\n",
    "s = \"\"\"\n",
    "strat main {Onf(cxns)}\n",
    "package cxns {\n",
    "    rule existential {\n",
    "        pattern {X-[comp@expl]->Y; X[lemma=be]}\n",
    "        without {X[Cxn=Existential]}\n",
    "        commands {X.Cxn=Existential}\n",
    "    }\n",
    "}\n",
    "\"\"\"\n",
    "\n",
    "grs_draft = GRSDraft(s)\n",
    "\n",
    "for rule in grs_draft['cxns'].rules():\n",
    "    print(f\"{rule=}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A `GRSDraft` cannot be applied to a corpus, it should be turned into a `GRS`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-07-03T15:07:48.600484Z",
     "iopub.status.busy": "2025-07-03T15:07:48.599679Z",
     "iopub.status.idle": "2025-07-03T15:07:48.643185Z",
     "shell.execute_reply": "2025-07-03T15:07:48.641886Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "n_existentials=27\n"
     ]
    }
   ],
   "source": [
    "grs = GRS(grs_draft)\n",
    "corpus.apply(grs)\n",
    "n_existentials = corpus.count(Request(\"pattern { X[Cxn=Existential] }\"))\n",
    "print(f\"{n_existentials=}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}