word_db: serialize correctly

Parse morphalou as word_db
2024-09-16 18:41:17 +02:00 · 2024-09-16 18:41:17 +02:00
2 changed files with 261 additions and 2 deletions
--- a/pwgen_fr/morphalou.py
+++ b/pwgen_fr/morphalou.py
@ -0,0 +1,218 @@
+""" Reads the Morphalou dataset, in its TEI (XML) form """
+
+import typing as t
+from lxml import etree
+from pathlib import Path
+import itertools
+
+from .word_db import Adjectif, Adverbe, Genre, Nom, Nombre, Temps, Verbe, WordDb
+
+# Namespace prefixes passed to every XPath query in this module. Despite its
+# name, the "tsv" prefix is bound to the TEI namespace URI of the XML files.
+TSV_NS = {
+    "tsv": "http://www.tei-c.org/ns/1.0",
+    "xml": "http://www.w3.org/XML/1998/namespace",
+}
+
+
+class MorphalouSet:
+    MORPHALOU_DIR_PATH = (
+        Path(__file__).parent.parent
+        / "data/raw/morphalou/morphalou/5/Morphalou3.1_formatTEI"
+    )
+    MORPHALOU_FILENAME_TEMPLATE = "{cat_name}_Morphalou3.1_TEI.xml"
+
+    CAT_MAPPING: dict[t.Type[t.NamedTuple], str] = {
+        Nom: "commonNoun",
+        Adjectif: "adjective",
+        Verbe: "verb",
+        Adverbe: "adverb",
+    }
+
+    word_db: WordDb
+
+    def __init__(self):
+        self.word_db = WordDb()
+
+    def parse(self):
+        """Parses the dataset"""
+        for cat, cat_file in self.__class__.CAT_MAPPING.items():
+            word_db_elt = WordDb.CATEGORY_TO_ATTR[cat]
+            setattr(
+                self.word_db,
+                word_db_elt,
+                getattr(self, f"_parse_{word_db_elt}")(
+                    self.__class__.MORPHALOU_DIR_PATH
+                    / self.__class__.MORPHALOU_FILENAME_TEMPLATE.format(
+                        cat_name=cat_file
+                    )
+                ),
+            )
+
+    def _tsv_elems(self, tsv_path: Path):
+        """Opens a TSV file, and returns the <body> node, direct parent of all the
+        relevant nodes"""
+        with tsv_path.open("r") as h:
+            tree = etree.parse(h)
+        root = tree.getroot()
+        body = root.find("./tsv:text/tsv:body", TSV_NS)
+        return body
+
+    def _parse_noms(self, tsv_path: Path) -> list[Nom]:
+        """Parse the nouns"""
+        root = self._tsv_elems(tsv_path)
+        out: list[Nom] = []
+
+        for entry in root.iterfind("./tsv:entry", TSV_NS):
+            try:
+                genre = self._genre(
+                    entry.find(
+                        "./tsv:form[@type='lemma']/tsv:gramGrp/tsv:gen", TSV_NS
+                    ).text
+                )
+            except AttributeError:
+                continue  # some nouns don't have a gender defined, somehow -- ignore
+
+            forms = {}
+            for inflected in entry.iterfind("./tsv:form[@type='inflected']", TSV_NS):
+                orth = inflected.find("./tsv:orth", TSV_NS).text
+                nombres = self._nombre_set(
+                    inflected.find("./tsv:gramGrp/tsv:number", TSV_NS).text
+                )
+                for form in nombres:
+                    forms[form] = orth
+            try:
+                out.append(
+                    Nom(
+                        genre=genre,
+                        sing=forms[Nombre.SING],
+                        plur=forms[Nombre.PLUR],
+                    )
+                )
+            except KeyError:
+                continue  # cannot be inflected to all required forms: skip
+
+        return out
+
+    def _parse_adjectifs(self, tsv_path: Path) -> list[Adjectif]:
+        """Parse the adjectives"""
+        root = self._tsv_elems(tsv_path)
+        out: list[Adjectif] = []
+
+        for entry in root.iterfind("./tsv:entry", TSV_NS):
+            forms = {}
+            for inflected in entry.iterfind("./tsv:form[@type='inflected']", TSV_NS):
+                orth = inflected.find("./tsv:orth", TSV_NS).text
+                gram_grp = inflected.find("./tsv:gramGrp", TSV_NS)
+                genres = self._genre_set(gram_grp.find("./tsv:gen", TSV_NS).text)
+                nombres = self._nombre_set(gram_grp.find("./tsv:number", TSV_NS).text)
+
+                for form in itertools.product(genres, nombres):
+                    forms[form] = orth
+            try:
+                out.append(
+                    Adjectif(
+                        masc_sing=forms[Genre.MASC, Nombre.SING],
+                        masc_plur=forms[Genre.MASC, Nombre.PLUR],
+                        fem_sing=forms[Genre.FEM, Nombre.SING],
+                        fem_plur=forms[Genre.FEM, Nombre.PLUR],
+                    )
+                )
+            except KeyError:
+                continue  # cannot be inflected to all required forms: skip
+
+        return out
+
+    def _parse_verbes(self, tsv_path: Path) -> list[Verbe]:
+        """Parse the verbs"""
+        root = self._tsv_elems(tsv_path)
+        out: list[Verbe] = []
+
+        for entry in root.iterfind("./tsv:entry", TSV_NS):
+            forms = {}
+            for inflected in entry.iterfind("./tsv:form[@type='inflected']", TSV_NS):
+                gram_grp = inflected.find("./tsv:gramGrp", TSV_NS)
+
+                # Order of tests is important! If mood == 'participle', there is no
+                # 'person' defined.
+                if (
+                    gram_grp.find("./tsv:mood", TSV_NS).text != "indicative"
+                    or gram_grp.find("./tsv:per", TSV_NS).text != "thirdPerson"
+                ):
+                    continue  # irrelevant for us
+
+                temps = self._tense(gram_grp.find("./tsv:tns", TSV_NS).text)
+                if temps is None:
+                    continue  # irrelevant for us
+
+                nombres = self._nombre_set(gram_grp.find("./tsv:number", TSV_NS).text)
+
+                orth = inflected.find("./tsv:orth", TSV_NS).text
+                for nombre in nombres:
+                    forms[(temps, nombre)] = orth
+            try:
+                out.append(
+                    Verbe(
+                        present_sing=forms[Temps.PRESENT, Nombre.SING],
+                        present_plur=forms[Temps.PRESENT, Nombre.PLUR],
+                        futur_sing=forms[Temps.FUTUR, Nombre.SING],
+                        futur_plur=forms[Temps.FUTUR, Nombre.PLUR],
+                        imparfait_sing=forms[Temps.IMPARFAIT, Nombre.SING],
+                        imparfait_plur=forms[Temps.IMPARFAIT, Nombre.PLUR],
+                    )
+                )
+            except KeyError:
+                continue  # cannot be inflected to all required forms: skip
+
+        return out
+
+    def _parse_adverbes(self, tsv_path: Path) -> list[Adverbe]:
+        """Parse the adverbs"""
+        root = self._tsv_elems(tsv_path)
+        out: list[Adverbe] = []
+
+        for entry in root.iterfind("./tsv:entry", TSV_NS):
+            # We're only interested in the lemma form
+            orth = entry.find("./tsv:form[@type='lemma']/tsv:orth", TSV_NS)
+            assert orth is not None
+            adv = orth.text
+            out.append(Adverbe(adv=adv))
+
+        return out
+
+    @staticmethod
+    def _genre_set(genre: str) -> list[Genre]:
+        return {
+            "masculine": [Genre.MASC],
+            "feminine": [Genre.FEM],
+            "invariable": [Genre.MASC, Genre.FEM],
+        }[genre]
+
+    @staticmethod
+    def _genre(genre: str) -> Genre:
+        return {
+            "masculine": Genre.MASC,
+            "feminine": Genre.FEM,
+            "invariable": Genre.INV,
+        }[genre]
+
+    @staticmethod
+    def _nombre(nombre: str) -> Nombre:
+        return {
+            "singular": Nombre.SING,
+            "plural": Nombre.PLUR,
+        }[nombre]
+
+    @staticmethod
+    def _nombre_set(nombre: str) -> list[Nombre]:
+        return {
+            "singular": [Nombre.SING],
+            "plural": [Nombre.PLUR],
+            "invariable": [Nombre.SING, Nombre.PLUR],
+        }[nombre]
+
+    @staticmethod
+    def _tense(tense: str) -> t.Optional[Temps]:
+        return {
+            "present": Temps.PRESENT,
+            "imperfect": Temps.IMPARFAIT,
+            "future": Temps.FUTUR,
+        }.get(tense, None)
--- a/pwgen_fr/word_db.py
+++ b/pwgen_fr/word_db.py
@ -8,6 +8,7 @@ import json
 class Genre(Enum):
    MASC = "masculin"
    FEM = "féminin"
+    INV = "invariable"  # for nouns only


 class Nombre(Enum):
@ -35,6 +36,15 @@ class Nom(t.NamedTuple):
        """Accorde en nombre"""
        return getattr(self, nombre.name.lower())

+    @property
+    def serialized(self):
+        """Plain-dict form of this noun.
+
+        The gender is stored as the enum member *name* (``self.genre.name``),
+        not as its value.
+        """
+        return {"genre": self.genre.name, "sing": self.sing, "plur": self.plur}
+
+    @classmethod
+    def unserialized(cls, **kwargs):
+        genre = Genre(kwargs.pop("genre"))
+        return cls(**kwargs, genre=genre)
+

 class Adjectif(t.NamedTuple):
    masc_sing: str
@ -49,6 +59,14 @@ class Adjectif(t.NamedTuple):
        """Accorde en genre et en nombre"""
        return getattr(self, f"{genre.name.lower()}_{nombre.name.lower()}")

+    @property
+    def serialized(self):
+        """Plain-dict form of this adjective: field name -> field value."""
+        return self._asdict()
+
+    @classmethod
+    def unserialized(cls, **kwargs):
+        """Rebuild an adjective from keyword arguments named after its fields."""
+        return cls(**kwargs)
+

 class Verbe(t.NamedTuple):
    present_sing: str
@ -65,6 +83,14 @@ class Verbe(t.NamedTuple):
        """Accorde en temps et en nombre (seule la 3è pers. est utilisée)"""
        return getattr(self, f"{temps.name.lower()}_{nombre.name.lower()}")

+    @property
+    def serialized(self):
+        """Plain-dict form of this verb: field name -> field value."""
+        return self._asdict()
+
+    @classmethod
+    def unserialized(cls, **kwargs):
+        """Rebuild a verb from keyword arguments named after its fields."""
+        return cls(**kwargs)
+

 class Adverbe(t.NamedTuple):
    """Packed as named tuple for consistence"""
@ -78,6 +104,14 @@ class Adverbe(t.NamedTuple):
        """for consistence"""
        return self.adv

+    @property
+    def serialized(self):
+        """Plain-dict form of this adverb: field name -> field value."""
+        return self._asdict()
+
+    @classmethod
+    def unserialized(cls, **kwargs):
+        """Rebuild an adverb from keyword arguments named after its fields."""
+        return cls(**kwargs)
+

 class WordDb:
    """Base de donnée de mots, sérialisable"""
@ -89,6 +123,13 @@ class WordDb:
        "adverbes": Adverbe,
    }

+    # Word class -> name of the WordDb attribute holding the list of such words
+    CATEGORY_TO_ATTR: dict = {
+        Nom: "noms",
+        Adjectif: "adjectifs",
+        Verbe: "verbes",
+        Adverbe: "adverbes",
+    }
+
    noms: list[Nom]
    adjectifs: list[Adjectif]
    verbes: list[Verbe]
@ -109,7 +150,7 @@ class WordDb:
    def serialize(self) -> dict:
        """Serialize to plain dictionary (no classes)

        Each attribute name listed in ``_serialize_data`` is mapped to the
        list of the ``serialized`` forms of the items stored under it.
        """
        return {
-            attr: [x._asdict() for x in getattr(self, attr)]
+            attr: [x.serialized for x in getattr(self, attr)]
            for attr in self.__class__._serialize_data
        }

@ -123,7 +164,7 @@ class WordDb:
        """Reverses :serialize:"""
        parsed = {}
        for attr, attr_cls in cls._serialize_data.items():
-            parsed[attr] = list(map(attr_cls, data[attr]))
+            parsed[attr] = list(map(attr_cls.unserialized, data[attr]))
        return cls(**parsed)

    @classmethod
Author	SHA1	Message	Date
Théophile Bastian	a9c3c90405	word_db: serialize correctly	2024-09-16 18:41:17 +02:00
Théophile Bastian	b086b9a08d	Parse morphalou as word_db	2024-09-16 18:41:17 +02:00