# Source code for grewpy.corpus

"""
Grew module : anything you want to talk about graphs
Graphs are represented either by a dict (called dict-graph),
or by an str (str-graph).
"""
import os.path
import glob

import sys
import tempfile
import json
import typing
import numpy as np

from .network import send_and_receive
from .graph import Graph
from .grew import GrewError
from .observation import Observation

from . import network

from .matchings import Matchings

class AbstractCorpus():
    """Shared comparison utilities for corpus-like containers.

    Subclasses must be iterable over sentence ids and support
    ``self[sent_id]`` returning ``Graph`` objects (see ``Corpus`` and
    ``CorpusDraft``).
    """

    @staticmethod
    def _scores(common, left, right):
        """Build the diff report from raw edge counts.

        The ``1e-10`` epsilons guard every division against a zero
        denominator (empty corpora or fully disjoint edge sets), in which
        case the corresponding score is 0.0 instead of NaN.
        """
        precision = common / (common + left + 1e-10)
        recall = common / (common + right + 1e-10)
        f_measure = 2 * precision * recall / (precision + recall + 1e-10)
        return {
            "common": common,
            "left": left,
            "right": right,
            "precision": round(precision, 3),
            "recall": round(recall, 3),
            "f_measure": round(f_measure, 3),
        }

    def edge_diff(self, other, edge_criterion=lambda e: True):
        """
        Given two corpora, outputs the number of common edges,
        only left ones and only right ones.
        It also outputs precision, recall and f-measure.

        :param other: the corpus to compare against (same sentence ids)
        :param edge_criterion: predicate selecting which edges to compare
        :return: a dict with keys common/left/right/precision/recall/f_measure
        """
        (common, left, right) = np.sum(
            [self[sid].edge_diff(other[sid], edge_criterion) for sid in self],
            axis=0)
        return self._scores(common, left, right)

    def edge_diff_up_to(self, other, edge_transform=lambda e: e):
        """
        Same as ``edge_diff`` but edges are compared after applying
        ``edge_transform`` to each of them.

        :param other: the corpus to compare against (same sentence ids)
        :param edge_transform: function applied to each edge before comparison
        :return: a dict with keys common/left/right/precision/recall/f_measure
        """
        # Previously this method divided without the epsilon guards and
        # produced NaN scores on zero denominators; it now shares the
        # guarded computation with edge_diff.
        (common, left, right) = np.sum(
            [self[sid].edge_diff_up_to(other[sid], edge_transform) for sid in self],
            axis=0)
        return self._scores(common, left, right)
class CorpusDraft(AbstractCorpus, dict):
    """An in-memory, mutable corpus.

    The draft is composed of:
      - self, a dict mapping sentence_id to graphs
      - self._sent_ids, a list that specifies the sentence order
    """

    def __init__(self, data=None):
        """Load a corpus from a file or a string.

        :param data: a file, a list of files, a CoNLL string representation
                     of a corpus, a ``Corpus``, a ``CorpusDraft`` or a dict
                     to copy, or None for an empty draft
        :raise an error if the files were not correctly loaded
        """
        if isinstance(data, CorpusDraft):
            # copy each graph so the two drafts can be mutated independently
            super().__init__({sid: Graph(data[sid]) for sid in data})
            # carry over the sentence order when the source draft has one
            self._sent_ids = list(data._sent_ids) if hasattr(data, "_sent_ids") else list(self)
        elif isinstance(data, dict):
            super().__init__(data)
            # dicts preserve insertion order (Python >= 3.7)
            self._sent_ids = list(self)
        elif data is None:
            super().__init__()
            self._sent_ids = []
        else:
            acorpus = data if isinstance(data, Corpus) else Corpus(data)
            self._sent_ids = acorpus.get_sent_ids()  # specifies the sentences order
            super().__init__(acorpus.get_all())

    def __getitem__(self, data):
        """Search for [data] in previously loaded corpus.

        :param data: a sent_id (type string), a position (type int) or a slice
        :return: a graph (or a list of graphs for a slice)
        :raise TypeError: on an unsupported key type
        """
        if isinstance(data, str):
            return super().__getitem__(data)
        if isinstance(data, int):
            return self[self._sent_ids[data]]
        if isinstance(data, slice):
            return [self[sid] for sid in self._sent_ids[data]]
        # previously fell through and silently returned None
        raise TypeError(f"unsupported index type: {type(data).__name__}")

    def apply(self, fun):
        """Apply fun to all graphs, return the new Corpus"""
        return CorpusDraft({sid: fun(self[sid]) for sid in self})

    def to_conll(self):
        """Return a CoNLL string representation of the corpus"""
        return Corpus(self).to_conll()
class Corpus(AbstractCorpus):
    """A corpus stored in the Grew backend, referenced by an integer index."""

    def __init__(self, data):
        """An abstract corpus.

        :param data: a file, a list of files, a directory, a list of Graphs,
                     a dict mapping sent_ids to Graphs, or a CoNLL string
                     representation of a corpus
        :raise an error if the files was not correctly loaded
        """
        if isinstance(data, list):
            if data and isinstance(data[0], Graph):
                graphs = {f'{i}': data[i].json_data() for i in range(len(data))}
                req = {"command": "corpus_from_dict", "graphs": graphs}
            else:  # supposed to be a list of files
                req = {"command": "corpus_load", "files": data}
            reply = network.send_and_receive(req)
        elif isinstance(data, dict):
            req = {"command": "corpus_from_dict", "graphs": {
                sent_id: graph.json_data() for (sent_id, graph) in data.items()}}
            reply = network.send_and_receive(req)
        elif os.path.isdir(data):
            # load the conllu/conll/cupt files of the directory
            file_list = glob.glob(f"{data}/*.conllu") + glob.glob(f"{data}/*.conll") + glob.glob(f"{data}/*.cupt")
            req = {"command": "corpus_load", "files": file_list}
            reply = network.send_and_receive(req)
        elif os.path.isfile(data):
            req = {"command": "corpus_load", "files": [data]}
            reply = network.send_and_receive(req)
        else:
            # fallback: treat data as raw CoNLL content, passed via a temp file
            with tempfile.NamedTemporaryFile(mode="w", delete=True, suffix=".conll") as f:
                f.write(data)
                f.flush()  # to be read by others
                req = {"command": "corpus_load", "files": [f.name]}
                try:
                    reply = network.send_and_receive(req)
                except GrewError:
                    # report the offending data rather than the temp file name
                    raise GrewError(data)
        self._length = reply["length"]
        self._id = reply["index"]

    def get_sent_ids(self):
        """Return the list of sentence ids"""
        req = {"command": "corpus_sent_ids", "corpus_index": self._id}
        return network.send_and_receive(req)

    def get_id(self):
        """Return the id of the corpus"""
        return self._id

    def clean(self):
        """Clean the corpus (remove from the backend memory)"""
        req = {"command": "corpus_clean", "corpus_index": self._id}
        return network.send_and_receive(req)

    def get(self, sent_id):
        """Return a graph corresponding to the sentence id sent_id"""
        req = {"command": "corpus_get", "corpus_index": self._id, "sent_id": sent_id}
        return Graph.from_json(network.send_and_receive(req))

    def __getitem__(self, data):
        """Return a graph corresponding to data, either
        - a sentence id,
        - an index in the sentence id array
        - a slice

        :raise TypeError: on an unsupported key type
        """
        if isinstance(data, str):
            return self.get(data)
        if isinstance(data, int):
            return self.get(self.get_sent_ids()[data])
        if isinstance(data, slice):
            return [self.get(sid) for sid in self.get_sent_ids()[data]]
        # previously fell through and silently returned None
        raise TypeError(f"unsupported index type: {type(data).__name__}")

    def get_all(self):
        """Return a dictionary mapping sentence ids to graphs"""
        dico = network.send_and_receive({"command": "corpus_get_all", "corpus_index": self._id})
        return {sid: Graph.from_json(json_data) for (sid, json_data) in dico.items()}

    def search(self, request, clustering_parameter=None, clustering_keys=None, flat=None, deco=False, bound=None, timeout=None):
        """Search for [request] into [corpus_index].

        Parameters:
            request (Request): a request
            clustering_parameter: clustering parameters sent to the backend
            clustering_keys: clustering keys sent to the backend
            flat: "matchings" to wrap the result in a Matchings,
                  "observations" to wrap it in an Observation
            deco: whether the backend should build decorations
            bound: bound on the number of matchings
            timeout: backend timeout
        Returns:
            list: the list of matching of [request] into the corpus
        """
        # None defaults (instead of mutable []) normalized here
        clustering_parameter = clustering_parameter or []
        clustering_keys = clustering_keys or []
        res = network.send_and_receive({
            "command": "corpus_search",
            "corpus_index": self._id,
            "request": request.json_data(),
            "clustering_keys": clustering_parameter + clustering_keys,
            "build_deco": deco,
            "bound": bound,
            "timeout": timeout,
        })
        if flat == "matchings":
            return Matchings(res, self)
        # NOTE: the condition is parenthesized on purpose; the previous code
        # read `flat == "observations" and clustering_parameter or clustering_keys`,
        # which returned an Observation whenever clustering_keys was non-empty,
        # regardless of the value of flat.
        if flat == "observations" and (clustering_parameter or clustering_keys):
            return Observation(res, clustering_parameter, clustering_keys)
        return res

    def count(self, request, clustering_parameter=None, clustering_keys=None, flat=False):
        """Count for [request] into [corpus_index].

        :param request: a string request
        :param clustering_parameter: clustering parameters sent to the backend
        :param clustering_keys: clustering keys sent to the backend
        :param flat: if True and clustering is used, wrap the result in an Observation
        :return: the number of matching of [request] into the corpus
        """
        # None defaults (instead of mutable []) normalized here
        clustering_parameter = clustering_parameter or []
        clustering_keys = clustering_keys or []
        res = network.send_and_receive({
            "command": "corpus_count",
            "corpus_index": self._id,
            "request": request.json_data(),
            "clustering_keys": clustering_parameter + clustering_keys,
        })
        if flat and (clustering_parameter or clustering_keys):
            return Observation(obs=res, parameter=clustering_parameter, keys=clustering_keys)
        return res

    def __len__(self):
        return self._length

    def __iter__(self):
        return iter(self.get_sent_ids())

    def to_conll(self):
        """Return a CoNLL string for the current corpus"""
        reply = network.send_and_receive({
            "command": "corpus_to_conll",
            "corpus_index": self._id
        })
        return reply

    def count_feature_values(self, include=None, exclude=["xpos", "wordform", "textform", "SpaceAfter"]):
        """Return a dict with feature names as key and subdict a value.

        For each feature name, the subdict maps existing feature values with
        the number of occurrences.
        If include is set as a list of string, only feature names in the list
        are taken into account; else exclude defines the list of feature names
        which are not taken into account.
        """
        # NOTE: exclude keeps its list default for interface compatibility;
        # it is only read, never mutated, so the shared default is safe here.
        reply = network.send_and_receive({
            "command": "corpus_count_feature_values",
            "corpus_index": self._id,
            "include": include,
            "exclude": exclude,
        })
        return reply

    def run(self, Grs, strat="main"):
        """Delegate to Grs.run on this corpus with the given strategy"""
        return Grs.run(self, strat)

    def apply(self, Grs, strat="main"):
        """Delegate to Grs.apply on this corpus with the given strategy"""
        return Grs.apply(self, strat)