Accords des noms, conjugaison des verbes (WiP)

2024-09-09 00:28:50 +02:00 · 2024-09-09 00:28:50 +02:00 · 874329c982
commit 874329c982
parent d3fbd47037
1 changed files with 140 additions and 20 deletions
--- a/pwgen_fr/lexique.py
+++ b/pwgen_fr/lexique.py
@ -1,7 +1,10 @@
 import csv
+import itertools
+from dataclasses import dataclass
 import logging
 import subprocess
 import typing as t
+from bisect import bisect_left
 import enum
 from pathlib import Path

@ -27,13 +30,88 @@ class CatGram(enum.Enum):
        base = val.split(":", maxsplit=1)[0]
        return cls(base)

+    def __lt__(self, oth):  # Order members by value, so that (mot, cat_gram) keys can be sorted and bisected
+        return self.value < oth.value if isinstance(oth, type(self)) else NotImplemented

-class Word(t.NamedTuple):
-    word: str
-    lemme: str  # canonical form
+
+def match_enum_or_all(val, enum_cls) -> list:
+    """The member of `enum_cls` matching `val` (a member, or a member's value) if any; else, all members"""
+    # No `val in enum_cls` here: before Python 3.12 it raises TypeError when `val` is not an Enum member.
+    matches = [member for member in enum_cls if val is member or val == member.value]
+    return matches or list(enum_cls)
+
+
+class Genre(enum.Enum):  # Grammatical gender; values are matched against the "genre" column when parsing
+    MASC = "m"
+    FEM = "f"
+
+
+class Nombre(enum.Enum):  # Grammatical number; values are matched against the "nombre" column when parsing
+    SING = "s"
+    PLUR = "p"
+
+
+class Temps(enum.Enum):  # Verb tenses kept when parsing the ";"-separated entries of the "infover" column
+    INFINITIF = "inf"  # stored without a person
+    PRESENT = "ind:pre"  # "ind:..." values are the first two ":"-separated fields of an entry
+    FUTUR = "ind:fut"
+    IMPARFAIT = "ind:imp"
+
+
+class Personne(enum.Enum):  # Grammatical person; values are matched against the third field of an "ind:..." entry
+    S1 = "1s"
+    S2 = "2s"
+    S3 = "3s"
+    P1 = "1p"
+    P2 = "2p"
+    P3 = "3p"
+
+
+@dataclass
+class _Mot:
+    """Canonical form (lemma) of a word: the plain data fields that Mot builds upon"""
+
+    mot: str  # spelling of the canonical form
    cat_gram: CatGram
-    freq_lem: float  # occurrences of the canonical form, in films, by million words
-    freq: float  # occurrences of this exact form, in films, by million words
+    freq: float  # occurrences of the canonical form per million words (parsed from "freqlemlivres")
+
+
+class Mot(_Mot):  # A lemma together with the spellings of its inflected forms, keyed by Variant
+    class Variant:  # Placeholder key type; Nom and Verbe each define their own Variant
+        pass
+
+    _for_cat_gram: dict[CatGram, t.Type["Mot"]] = {}  # CatGram -> Mot subclass; reassigned after the subclasses exist
+    _variants: dict  # Variant -> spelling of the inflected form
+
+    def __init__(self, *args, **kwargs):  # Takes the same arguments as the _Mot dataclass constructor
+        super().__init__(*args, **kwargs)
+        self._variants = {}  # per-instance mapping, filled by Lexique.parse
+
+    def accord(self, variant: Variant) -> str:  # Spelling recorded for `variant`; KeyError if none was recorded
+        return self._variants[variant]
+
+    @classmethod
+    def for_cat_gram(cls, cat_gram: CatGram) -> t.Type["Mot"]:
+        """The class to use for a word of given CatGram (this class itself when none is registered)"""
+        return cls._for_cat_gram.get(cat_gram, cls)
+
+
+class Nom(Mot):  # Noun: inflected forms are keyed by (genre, nombre)
+    class Variant(t.NamedTuple):
+        genre: Genre
+        nombre: Nombre
+
+
+class Verbe(Mot):  # Verb: inflected forms are keyed by (temps, personne)
+    class Variant(t.NamedTuple):
+        temps: Temps
+        personne: t.Optional[Personne]  # None for the infinitive
+
+
+Mot._for_cat_gram = {  # Filled in here because Nom and Verbe can only be defined after Mot itself
+    CatGram.NOM: Nom,
+    CatGram.VERBE: Verbe,
+}


 class Lexique:
@ -47,7 +125,7 @@ class Lexique:
        CatGram.ADVERBE: 10000,
    }

-    dataset: list[Word]
+    dataset: list[Mot]

    def __init__(self, dataset):
        self.dataset = dataset
@ -86,29 +164,71 @@ class Lexique:
    @classmethod
    def parse(cls) -> "Lexique":
+        """Parse LEXIQUE_PATH: rows with ortho == lemme become Mot entries; noun/verb rows fill their variants"""
        out = []
+        rows = []
        with cls.LEXIQUE_PATH.open("r") as h:
            reader = csv.DictReader(h, dialect="excel-tab")
            for row in reader:
                if not row["cgram"]:
                    continue
-                try:
-                    out.append(
-                        Word(
-                            word=row["ortho"],
-                            lemme=row["lemme"],
-                            cat_gram=CatGram.parse(row["cgram"]),
-                            freq_lem=float(row["freqlemlivres"]),
-                            freq=float(row["freqlivres"]),
-                        )
-                    )
-                except ValueError as exn:
-                    print(row)
-                    raise exn from exn
+                rows.append(row)
+
+        # First pass: generate canonical forms (lemmas). NOTE(review): no de-duplication on (ortho, cgram) — confirm
+        for row in rows:
+            if row["lemme"] != row["ortho"]:
+                continue
+            cat_gram = CatGram.parse(row["cgram"])
+            out.append(
+                Mot.for_cat_gram(cat_gram)(
+                    mot=row["ortho"],
+                    cat_gram=cat_gram,
+                    freq=float(row["freqlemlivres"]),
+                )
+            )
+
+        out.sort(key=lambda x: (x.mot, x.cat_gram))  # bisect_left below relies on this exact ordering
+
+        # Second pass: populate variants
+        for row in rows:
+            str_lemme = row["lemme"]
+            cat_gram = CatGram.parse(row["cgram"])
+            lemme_pos = bisect_left(out, (str_lemme, cat_gram), key=lambda x: (x.mot, x.cat_gram))
+            if lemme_pos >= len(out) or (out[lemme_pos].mot, out[lemme_pos].cat_gram) != (str_lemme, cat_gram):
+                continue  # Unknown word: bisect_left may return len(out), or a neighbour of another spelling/category
+            lemme = out[lemme_pos]
+
+            if lemme.cat_gram == CatGram.NOM:
+                genres = match_enum_or_all(row["genre"], Genre)
+                nombres = match_enum_or_all(row["nombre"], Nombre)
+                for genre, nombre in itertools.product(genres, nombres):
+                    variant = Nom.Variant(genre=genre, nombre=nombre)
+                    lemme._variants[variant] = row["ortho"]
+
+            elif lemme.cat_gram == CatGram.VERBE:
+                infover = row["infover"].split(";")
+                for raw_ver in infover:
+                    ver = raw_ver.split(":")
+
+                    temps = personne = None  # personne stays None for the infinitive
+                    if ver[0] == "inf":
+                        temps = Temps(ver[0])
+                    elif ver[0] == "ind":
+                        temps_select = ":".join(ver[0:2])
+                        if all(tps.value != temps_select for tps in Temps):
+                            continue  # Tense not in Temps (a `str in Enum` test would raise TypeError before 3.12)
+                        temps = Temps(temps_select)
+                        personne = Personne(ver[2])
+                    else:
+                        continue
+
+                    variant = Verbe.Variant(temps=temps, personne=personne)
+                    lemme._variants[variant] = row["ortho"]
+
        return cls(out)

    def most_common(
        self, cat_gram: CatGram, threshold: t.Optional[int] = None
-    ) -> list[Word]:
+    ) -> list[Mot]:
        if threshold is None:
            try:
                threshold = self.PRESET_THRESHOLD_BY_CAT[cat_gram]