# Source code for grewpy.corpus

"""
Grew module : anything you want to talk about graphs
Graphs are represented either by a dict (called dict-graph),
or by an str (str-graph).
"""
import os.path
import glob

import sys
import tempfile
import json
import typing
import numpy as np

from .network import send_and_receive
from .graph import Graph
from .grew import GrewError
from .observation import Observation

from . import network

from .matchings import Matchings

class AbstractCorpus():
    """Shared comparison utilities for corpus-like containers.

    Subclasses must be iterable over sentence ids and support
    ``self[sent_id]`` returning ``Graph`` objects (see ``Corpus`` and
    ``CorpusDraft``).
    """

    @staticmethod
    def _scores(common, left, right):
        """Build the diff report from raw edge counts.

        The ``1e-10`` epsilons guard every division against a zero
        denominator (empty corpora or fully disjoint edge sets), in which
        case the corresponding score is 0.0 instead of NaN.
        """
        precision = common / (common + left + 1e-10)
        recall = common / (common + right + 1e-10)
        f_measure = 2 * precision * recall / (precision + recall + 1e-10)
        return {
            "common": common,
            "left": left,
            "right": right,
            "precision": round(precision, 3),
            "recall": round(recall, 3),
            "f_measure": round(f_measure, 3),
        }

    def edge_diff(self, other, edge_criterion=lambda e: True):
        """
        Given two corpora, outputs the number of common edges,
        only left ones and only right ones.
        It also outputs precision, recall and f-measure.

        :param other: the corpus to compare against (same sentence ids)
        :param edge_criterion: predicate selecting which edges to compare
        :return: a dict with keys common/left/right/precision/recall/f_measure
        """
        (common, left, right) = np.sum(
            [self[sid].edge_diff(other[sid], edge_criterion) for sid in self],
            axis=0)
        return self._scores(common, left, right)

    def edge_diff_up_to(self, other, edge_transform=lambda e: e):
        """
        Same as ``edge_diff`` but edges are compared after applying
        ``edge_transform`` to each of them.

        :param other: the corpus to compare against (same sentence ids)
        :param edge_transform: function applied to each edge before comparison
        :return: a dict with keys common/left/right/precision/recall/f_measure
        """
        # Previously this method divided without the epsilon guards and
        # produced NaN scores on zero denominators; it now shares the
        # guarded computation with edge_diff.
        (common, left, right) = np.sum(
            [self[sid].edge_diff_up_to(other[sid], edge_transform) for sid in self],
            axis=0)
        return self._scores(common, left, right)
class CorpusDraft(AbstractCorpus, dict):
    """An in-memory, mutable corpus.

    The draft is composed of:
      - self, a dict mapping sentence_id to graphs
      - self._sent_ids, a list that specifies the sentence order
    """

    def __init__(self, data=None):
        """Load a corpus from a file or a string.

        :param data: a file, a list of files, a CoNLL string representation
                     of a corpus, a ``Corpus``, a ``CorpusDraft`` or a dict
                     to copy, or None for an empty draft
        :raise an error if the files were not correctly loaded
        """
        if isinstance(data, CorpusDraft):
            # copy each graph so the two drafts can be mutated independently
            super().__init__({sid: Graph(data[sid]) for sid in data})
            # carry over the sentence order when the source draft has one
            self._sent_ids = list(data._sent_ids) if hasattr(data, "_sent_ids") else list(self)
        elif isinstance(data, dict):
            super().__init__(data)
            # dicts preserve insertion order (Python >= 3.7)
            self._sent_ids = list(self)
        elif data is None:
            super().__init__()
            self._sent_ids = []
        else:
            acorpus = data if isinstance(data, Corpus) else Corpus(data)
            self._sent_ids = acorpus.get_sent_ids()  # specifies the sentences order
            super().__init__(acorpus.get_all())

    def __getitem__(self, data):
        """Search for [data] in previously loaded corpus.

        :param data: a sent_id (type string), a position (type int) or a slice
        :return: a graph (or a list of graphs for a slice)
        :raise TypeError: on an unsupported key type
        """
        if isinstance(data, str):
            return super().__getitem__(data)
        if isinstance(data, int):
            return self[self._sent_ids[data]]
        if isinstance(data, slice):
            return [self[sid] for sid in self._sent_ids[data]]
        # previously fell through and silently returned None
        raise TypeError(f"unsupported index type: {type(data).__name__}")

    def apply(self, fun):
        """Apply fun to all graphs, return the new Corpus"""
        return CorpusDraft({sid: fun(self[sid]) for sid in self})

    def to_conll(self):
        """Return a CoNLL string representation of the corpus"""
        return Corpus(self).to_conll()
class Corpus(AbstractCorpus):
    """A corpus stored in the Grew backend, referenced by an integer index."""

    def __init__(self, data):
        """An abstract corpus.

        :param data: a file, a list of files, a directory, a list of Graphs,
                     a dict mapping sent_ids to Graphs, or a CoNLL string
                     representation of a corpus
        :raise an error if the files was not correctly loaded
        """
        if isinstance(data, list):
            if data and isinstance(data[0], Graph):
                graphs = {f'{i}': data[i].json_data() for i in range(len(data))}
                req = {"command": "corpus_from_dict", "graphs": graphs}
            else:  # supposed to be a list of files
                req = {"command": "corpus_load", "files": data}
            reply = network.send_and_receive(req)
        elif isinstance(data, dict):
            req = {"command": "corpus_from_dict", "graphs": {
                sent_id: graph.json_data() for (sent_id, graph) in data.items()}}
            reply = network.send_and_receive(req)
        elif os.path.isdir(data):
            # load the conllu/conll/cupt files of the directory
            file_list = glob.glob(f"{data}/*.conllu") + glob.glob(f"{data}/*.conll") + glob.glob(f"{data}/*.cupt")
            req = {"command": "corpus_load", "files": file_list}
            reply = network.send_and_receive(req)
        elif os.path.isfile(data):
            req = {"command": "corpus_load", "files": [data]}
            reply = network.send_and_receive(req)
        else:
            # fallback: treat data as raw CoNLL content, passed via a temp file
            with tempfile.NamedTemporaryFile(mode="w", delete=True, suffix=".conll") as f:
                f.write(data)
                f.flush()  # to be read by others
                req = {"command": "corpus_load", "files": [f.name]}
                try:
                    reply = network.send_and_receive(req)
                except GrewError:
                    # report the offending data rather than the temp file name
                    raise GrewError(data)
        self._length = reply["length"]
        self._id = reply["index"]

    def get_sent_ids(self):
        """Return the list of sentence ids"""
        req = {"command": "corpus_sent_ids", "corpus_index": self._id}
        return network.send_and_receive(req)

    def get_id(self):
        """Return the id of the corpus"""
        return self._id

    def clean(self):
        """Clean the corpus (remove from the backend memory)"""
        req = {"command": "corpus_clean", "corpus_index": self._id}
        return network.send_and_receive(req)

    def get(self, sent_id):
        """Return a graph corresponding to the sentence id sent_id"""
        req = {"command": "corpus_get", "corpus_index": self._id, "sent_id": sent_id}
        return Graph.from_json(network.send_and_receive(req))

    def __getitem__(self, data):
        """Return a graph corresponding to data, either
        - a sentence id,
        - an index in the sentence id array
        - a slice

        :raise TypeError: on an unsupported key type
        """
        if isinstance(data, str):
            return self.get(data)
        if isinstance(data, int):
            return self.get(self.get_sent_ids()[data])
        if isinstance(data, slice):
            return [self.get(sid) for sid in self.get_sent_ids()[data]]
        # previously fell through and silently returned None
        raise TypeError(f"unsupported index type: {type(data).__name__}")

    def get_all(self):
        """Return a dictionary mapping sentence ids to graphs"""
        dico = network.send_and_receive({"command": "corpus_get_all", "corpus_index": self._id})
        return {sid: Graph.from_json(json_data) for (sid, json_data) in dico.items()}

    def search(self, request, clustering_parameter=None, clustering_keys=None, flat=None, deco=False, bound=None, timeout=None):
        """Search for [request] into [corpus_index].

        Parameters:
            request (Request): a request
            clustering_parameter: clustering parameters sent to the backend
            clustering_keys: clustering keys sent to the backend
            flat: "matchings" to wrap the result in a Matchings,
                  "observations" to wrap it in an Observation
            deco: whether the backend should build decorations
            bound: bound on the number of matchings
            timeout: backend timeout
        Returns:
            list: the list of matching of [request] into the corpus
        """
        # None defaults (instead of mutable []) normalized here
        clustering_parameter = clustering_parameter or []
        clustering_keys = clustering_keys or []
        res = network.send_and_receive({
            "command": "corpus_search",
            "corpus_index": self._id,
            "request": request.json_data(),
            "clustering_keys": clustering_parameter + clustering_keys,
            "build_deco": deco,
            "bound": bound,
            "timeout": timeout,
        })
        if flat == "matchings":
            return Matchings(res, self)
        # NOTE: the condition is parenthesized on purpose; the previous code
        # read `flat == "observations" and clustering_parameter or clustering_keys`,
        # which returned an Observation whenever clustering_keys was non-empty,
        # regardless of the value of flat.
        if flat == "observations" and (clustering_parameter or clustering_keys):
            return Observation(res, clustering_parameter, clustering_keys)
        return res

    def count(self, request, clustering_parameter=None, clustering_keys=None, flat=False):
        """Count for [request] into [corpus_index].

        :param request: a string request
        :param clustering_parameter: clustering parameters sent to the backend
        :param clustering_keys: clustering keys sent to the backend
        :param flat: if True and clustering is used, wrap the result in an Observation
        :return: the number of matching of [request] into the corpus
        """
        # None defaults (instead of mutable []) normalized here
        clustering_parameter = clustering_parameter or []
        clustering_keys = clustering_keys or []
        res = network.send_and_receive({
            "command": "corpus_count",
            "corpus_index": self._id,
            "request": request.json_data(),
            "clustering_keys": clustering_parameter + clustering_keys,
        })
        if flat and (clustering_parameter or clustering_keys):
            return Observation(obs=res, parameter=clustering_parameter, keys=clustering_keys)
        return res

    def __len__(self):
        return self._length

    def __iter__(self):
        return iter(self.get_sent_ids())

    def to_conll(self):
        """Return a CoNLL string for the current corpus"""
        reply = network.send_and_receive({
            "command": "corpus_to_conll",
            "corpus_index": self._id
        })
        return reply

    def count_feature_values(self, include=None, exclude=["xpos", "wordform", "textform", "SpaceAfter"]):
        """Return a dict with feature names as key and subdict a value.

        For each feature name, the subdict maps existing feature values with
        the number of occurrences.
        If include is set as a list of string, only feature names in the list
        are taken into account; else exclude defines the list of feature names
        which are not taken into account.
        """
        # NOTE: exclude keeps its list default for interface compatibility;
        # it is only read, never mutated, so the shared default is safe here.
        reply = network.send_and_receive({
            "command": "corpus_count_feature_values",
            "corpus_index": self._id,
            "include": include,
            "exclude": exclude,
        })
        return reply

    def run(self, Grs, strat="main"):
        """Delegate to Grs.run on this corpus with the given strategy"""
        return Grs.run(self, strat)

    def apply(self, Grs, strat="main"):
        """Delegate to Grs.apply on this corpus with the given strategy"""
        return Grs.apply(self, strat)