Accords des noms, conjugaison des verbes (WiP)

This commit is contained in:
Théophile Bastian 2024-09-09 00:28:50 +02:00
parent d3fbd47037
commit 874329c982

View file

@ -1,7 +1,10 @@
import csv import csv
import itertools
from dataclasses import dataclass
import logging import logging
import subprocess import subprocess
import typing as t import typing as t
from bisect import bisect_left
import enum import enum
from pathlib import Path from pathlib import Path
@ -27,13 +30,88 @@ class CatGram(enum.Enum):
base = val.split(":", maxsplit=1)[0] base = val.split(":", maxsplit=1)[0]
return cls(base) return cls(base)
def __lt__(self, oth):
    """Order members by their underlying ``value``.

    Defining this makes members usable inside sort keys and bisect
    lookups, which only rely on ``<``.

    Returns:
        ``True``/``False`` when ``oth`` is a member of the same enum, and
        ``NotImplemented`` otherwise, so that Python raises its usual
        ``TypeError`` instead of an ``AttributeError`` on ``oth.value``.
    """
    if not isinstance(oth, type(self)):
        return NotImplemented
    return self.value < oth.value
class Word(t.NamedTuple):
def match_enum_or_all(val, enum_cls) -> list:
    """Return the enum member matching ``val``, or every member if none does.

    Args:
        val: a raw value (or an existing member) to look up in ``enum_cls``.
        enum_cls: the ``enum.Enum`` subclass to search.

    Returns:
        A one-element list holding the matching member, or the list of all
        members of ``enum_cls`` (in definition order) when ``val`` matches
        none of them.
    """
    # Look the value up through the constructor rather than with
    # ``val in enum_cls``: before Python 3.12 the ``in`` operator raises
    # TypeError when ``val`` is not itself an enum member (e.g. a plain
    # string read from a data file).
    try:
        return [enum_cls(val)]
    except ValueError:
        return list(enum_cls)
class Genre(enum.Enum):
    """Grammatical gender, valued by its one-letter code."""

    MASC = "m"  # masculine
    FEM = "f"  # feminine
class Nombre(enum.Enum):
    """Grammatical number, valued by its one-letter code."""

    SING = "s"  # singular
    PLUR = "p"  # plural
class Temps(enum.Enum):
    """Verb tenses handled so far.

    Each value is the colon-separated prefix that identifies the tense
    (mood, then tense for the indicative ones).
    """

    INFINITIF = "inf"  # infinitive
    PRESENT = "ind:pre"  # indicative, present
    FUTUR = "ind:fut"  # indicative, future
    IMPARFAIT = "ind:imp"  # indicative, imperfect
class Personne(enum.Enum):
    """Grammatical person of a conjugated verb form.

    Values are a person digit (1-3) followed by ``s`` (singular) or ``p``
    (plural).
    """

    # Singular persons
    S1 = "1s"
    S2 = "2s"
    S3 = "3s"
    # Plural persons
    P1 = "1p"
    P2 = "2p"
    P3 = "3p"
@dataclass
class _Mot:
    """Plain data fields of a lexicon entry, i.e. a word in canonical form."""

    # Spelling of the canonical form.
    mot: str
    # Grammatical category of the word.
    cat_gram: CatGram
    # Occurrences of the canonical form, per million words.
    freq: float
class Mot(_Mot):
    """A lexicon entry together with the inflected forms attached to it.

    Subclasses replace ``Variant`` with a type describing what identifies one
    of their inflected forms.
    """

    class Variant:
        """Placeholder key type for an inflected form; subclasses override it."""

    # Grammatical category -> entry class to instantiate for that category.
    _for_cat_gram: dict[CatGram, t.Type["Mot"]] = {}
    # Variant key -> inflected form; created empty for each instance.
    _variants: dict

    def __init__(self, *args, **kwargs):
        """Forward every argument to the dataclass initialiser, then start
        with an empty table of variants."""
        super().__init__(*args, **kwargs)
        self._variants = {}

    def accord(self, variant: Variant) -> str:
        """Return the form recorded for ``variant``.

        Raises:
            KeyError: when no form was recorded for ``variant``.
        """
        recorded = self._variants
        return recorded[variant]

    @classmethod
    def for_cat_gram(cls, cat_gram: CatGram) -> t.Type["Mot"]:
        """Return the class to use for a word of the given grammatical category.

        Falls back to the class this is called on when the category has no
        dedicated class registered.
        """
        registry = cls._for_cat_gram
        return registry[cat_gram] if cat_gram in registry else cls
class Nom(Mot):
    """Entry for a noun; its forms are keyed by gender and number."""

    class Variant(t.NamedTuple):
        """Key identifying one form of a noun."""

        genre: Genre  # grammatical gender of the form
        nombre: Nombre  # grammatical number of the form
class Verbe(Mot):
    """Entry for a verb; its forms are keyed by tense and person."""

    class Variant(t.NamedTuple):
        """Key identifying one conjugated form of a verb."""

        temps: Temps  # tense of the form
        personne: t.Optional[Personne]  # None when the form has no person
# Register the specialised entry classes. This has to happen after ``Nom`` and
# ``Verbe`` are defined; categories absent from the mapping make
# ``for_cat_gram`` fall back to the class it is called on.
Mot._for_cat_gram = dict(
    (
        (CatGram.NOM, Nom),
        (CatGram.VERBE, Verbe),
    )
)
class Lexique: class Lexique:
@ -47,7 +125,7 @@ class Lexique:
CatGram.ADVERBE: 10000, CatGram.ADVERBE: 10000,
} }
dataset: list[Word] dataset: list[Mot]
def __init__(self, dataset):
    """Wrap an already-built collection of entries.

    Args:
        dataset: the entries to expose as ``self.dataset``; stored as-is,
            without copying.
    """
    self.dataset = dataset
@ -86,29 +164,71 @@ class Lexique:
@classmethod
def parse(cls) -> "Lexique":
    """Build a ``Lexique`` from the tab-separated file at ``cls.LEXIQUE_PATH``.

    The file is read once, then processed in two passes:

    1. every row whose ``lemme`` equals its ``ortho`` becomes a canonical
       entry (``Mot``, or the class registered for its grammatical category);
    2. every row is then attached, as an inflected form, to the canonical
       entry that has the same lemma *and* the same grammatical category.

    Rows with an empty ``cgram`` are ignored, as are rows whose canonical
    entry is unknown. When several canonical entries share the same spelling
    and category, only the first one in sort order receives the variants.

    Returns:
        A ``Lexique`` wrapping the canonical entries, sorted by
        ``(mot, cat_gram)``.

    Raises:
        ValueError: if a frequency is not a valid float, or if a verb form of
            a supported tense carries an unknown person code (a category that
            ``CatGram.parse`` rejects presumably raises it too).
        IndexError: if a verb form of a supported tense has no person field.
    """

    def sort_key(mot):
        # Single definition of the ordering shared by the sort and the bisect.
        return (mot.mot, mot.cat_gram)

    # NOTE(review): the file is opened with the platform default encoding.
    with cls.LEXIQUE_PATH.open("r") as h:
        reader = csv.DictReader(h, dialect="excel-tab")
        rows = [row for row in reader if row["cgram"]]

    # First pass: generate canonical forms (lemmes)
    out = []
    for row in rows:
        if row["lemme"] != row["ortho"]:
            continue
        cat_gram = CatGram.parse(row["cgram"])
        mot_cls = Mot.for_cat_gram(cat_gram)
        out.append(
            mot_cls(
                mot=row["ortho"],
                cat_gram=cat_gram,
                freq=float(row["freqlemlivres"]),
            )
        )

    out.sort(key=sort_key)  # We need to bisect on this.

    # Second pass: populate variants
    for row in rows:
        wanted = (row["lemme"], CatGram.parse(row["cgram"]))
        lemme_pos = bisect_left(out, wanted, key=sort_key)
        # bisect_left may return len(out), and otherwise only yields an
        # insertion point: require an exact match on both spelling and
        # category before attaching anything.
        if lemme_pos >= len(out) or sort_key(out[lemme_pos]) != wanted:
            continue  # Unknown word
        lemme = out[lemme_pos]

        if lemme.cat_gram == CatGram.NOM:
            genres = match_enum_or_all(row["genre"], Genre)
            nombres = match_enum_or_all(row["nombre"], Nombre)
            for genre, nombre in itertools.product(genres, nombres):
                variant = Nom.Variant(genre=genre, nombre=nombre)
                lemme._variants[variant] = row["ortho"]
        elif lemme.cat_gram == CatGram.VERBE:
            for raw_ver in row["infover"].split(";"):
                ver = raw_ver.split(":")
                if ver[0] == "inf":
                    temps = Temps.INFINITIF
                    personne = None
                elif ver[0] == "ind":
                    # Constructor lookup instead of ``in Temps``: the latter
                    # raises TypeError for plain strings before Python 3.12.
                    try:
                        temps = Temps(":".join(ver[0:2]))
                    except ValueError:
                        continue  # Tense not handled
                    personne = Personne(ver[2])
                else:
                    continue
                variant = Verbe.Variant(temps=temps, personne=personne)
                lemme._variants[variant] = row["ortho"]

    return cls(out)
def most_common( def most_common(
self, cat_gram: CatGram, threshold: t.Optional[int] = None self, cat_gram: CatGram, threshold: t.Optional[int] = None
) -> list[Word]: ) -> list[Mot]:
if threshold is None: if threshold is None:
try: try:
threshold = self.PRESET_THRESHOLD_BY_CAT[cat_gram] threshold = self.PRESET_THRESHOLD_BY_CAT[cat_gram]