Accords des noms, conjugaison des verbes (WiP)
This commit is contained in:
parent
d3fbd47037
commit
874329c982
1 changed files with 140 additions and 20 deletions
|
@ -1,7 +1,10 @@
|
||||||
import csv
|
import csv
|
||||||
|
import itertools
|
||||||
|
from dataclasses import dataclass
|
||||||
import logging
|
import logging
|
||||||
import subprocess
|
import subprocess
|
||||||
import typing as t
|
import typing as t
|
||||||
|
from bisect import bisect_left
|
||||||
import enum
|
import enum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
@ -27,13 +30,88 @@ class CatGram(enum.Enum):
|
||||||
base = val.split(":", maxsplit=1)[0]
|
base = val.split(":", maxsplit=1)[0]
|
||||||
return cls(base)
|
return cls(base)
|
||||||
|
|
||||||
|
def __lt__(self, oth):
    """Order two members of the same enum by their ``value``.

    Defining this makes members usable inside sort and bisect keys.
    Operands of a different class yield ``NotImplemented`` so that Python
    can try the reflected comparison or raise ``TypeError``, instead of
    failing with ``AttributeError`` on a missing ``value`` attribute.
    """
    if self.__class__ is not oth.__class__:
        return NotImplemented
    return self.value < oth.value
|
||||||
|
|
||||||
class Word(t.NamedTuple):
|
|
||||||
word: str
|
def match_enum_or_all(val, enum_cls) -> list:
    """Return the member of ``enum_cls`` matching ``val``, or every member.

    Args:
        val: A raw value to look up in the enum (e.g. a column read from a
            data file).
        enum_cls: The ``enum.Enum`` subclass to search.

    Returns:
        A one-element list holding the matching member when ``val`` is a
        valid value of ``enum_cls``; otherwise the list of all members, in
        definition order.
    """
    # Build the member directly instead of testing ``val in enum_cls``:
    # before Python 3.12 that containment test raises TypeError when ``val``
    # is a raw value rather than an enum member.
    try:
        return [enum_cls(val)]
    except ValueError:
        return list(enum_cls)
|
||||||
|
|
||||||
|
|
||||||
|
class Genre(enum.Enum):
    """Grammatical gender, identified by a one-letter code."""

    MASC = "m"
    FEM = "f"
|
||||||
|
|
||||||
|
|
||||||
|
class Nombre(enum.Enum):
    """Grammatical number, identified by a one-letter code."""

    SING = "s"
    PLUR = "p"
|
||||||
|
|
||||||
|
|
||||||
|
class Temps(enum.Enum):
    """Verb tenses handled by the lexicon.

    Each value is a ``mode`` or ``mode:tense`` code made of the leading
    ``:``-separated fields of a conjugation descriptor.
    """

    INFINITIF = "inf"
    PRESENT = "ind:pre"
    FUTUR = "ind:fut"
    IMPARFAIT = "ind:imp"
|
||||||
|
|
||||||
|
|
||||||
|
class Personne(enum.Enum):
    """Grammatical person: a digit (1-3) followed by ``s`` or ``p``."""

    S1 = "1s"
    S2 = "2s"
    S3 = "3s"
    P1 = "1p"
    P2 = "2p"
    P3 = "3p"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class _Mot:
    """Canonical form of a word"""

    mot: str  # spelling of the canonical form
    cat_gram: CatGram  # grammatical category
    freq: float  # occurrences of the canonical form by million words
|
|
||||||
|
|
||||||
|
class Mot(_Mot):
    """A canonical word together with its inflected spellings.

    Inflected spellings are stored in ``_variants``, keyed by instances of
    the class' ``Variant`` type. Subclasses redefine ``Variant`` to describe
    the inflections relevant to their grammatical category.
    """

    class Variant:
        """Placeholder key type; subclasses replace it with a concrete one."""

    # Registry: grammatical category -> Mot subclass handling that category.
    # Shared by the whole hierarchy, hence ClassVar rather than a per-instance
    # attribute.
    _for_cat_gram: t.ClassVar[dict[CatGram, t.Type["Mot"]]] = {}
    # Inflected spellings of this word, keyed by Variant.
    _variants: dict

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the base constructor; start with no variant."""
        super().__init__(*args, **kwargs)
        self._variants = {}

    def accord(self, variant: Variant) -> str:
        """Return the spelling recorded for ``variant``.

        Raises:
            KeyError: if no spelling was recorded for ``variant``.
        """
        return self._variants[variant]

    @classmethod
    def for_cat_gram(cls, cat_gram: CatGram) -> t.Type["Mot"]:
        """The class to use for a word of given CatGram

        Falls back to ``cls`` itself when no class is registered for
        ``cat_gram``.
        """
        return cls._for_cat_gram.get(cat_gram, cls)
|
||||||
|
|
||||||
|
|
||||||
|
class Nom(Mot):
    """A noun, whose spellings are keyed by gender and number."""

    class Variant(t.NamedTuple):
        """Key identifying one inflected form of a noun."""

        genre: Genre
        nombre: Nombre
|
||||||
|
|
||||||
|
|
||||||
|
class Verbe(Mot):
    """A verb, whose spellings are keyed by tense and person."""

    class Variant(t.NamedTuple):
        """Key identifying one conjugated form of a verb."""

        temps: Temps
        personne: t.Optional[Personne]  # may be None (no person attached)
|
||||||
|
|
||||||
|
|
||||||
|
# Filled in here rather than in the body of Mot because the specialised
# classes only exist once they have been defined, after Mot itself.
Mot._for_cat_gram = {
    CatGram.NOM: Nom,
    CatGram.VERBE: Verbe,
}
|
||||||
|
|
||||||
|
|
||||||
class Lexique:
|
class Lexique:
|
||||||
|
@ -47,7 +125,7 @@ class Lexique:
|
||||||
CatGram.ADVERBE: 10000,
|
CatGram.ADVERBE: 10000,
|
||||||
}
|
}
|
||||||
|
|
||||||
dataset: list[Word]
|
dataset: list[Mot]
|
||||||
|
|
||||||
def __init__(self, dataset):
    """Store ``dataset`` as the list of words backing this lexicon."""
    self.dataset = dataset
|
||||||
|
@ -86,29 +164,71 @@ class Lexique:
|
||||||
@classmethod
def parse(cls) -> "Lexique":
    """Build a lexicon from the tab-separated file at ``cls.LEXIQUE_PATH``.

    Rows with an empty ``cgram`` column are ignored. The remaining rows are
    processed in two passes:

    1. every row whose ``ortho`` equals its ``lemme`` becomes a canonical
       word (an instance of the class registered for its category);
    2. every row is then attached, as an inflected form, to the canonical
       word having the same ``lemme`` and the same category. Rows with no
       such canonical word are skipped.

    Returns:
        A new instance wrapping the canonical words, sorted by
        ``(mot, cat_gram)``.
    """

    def sort_key(mot):
        # Single definition shared by sort() and bisect_left() so that both
        # are guaranteed to agree on the ordering.
        return (mot.mot, mot.cat_gram)

    with cls.LEXIQUE_PATH.open("r") as h:
        reader = csv.DictReader(h, dialect="excel-tab")
        rows = [row for row in reader if row["cgram"]]

    # First pass: generate canonical forms (lemmes)
    out = []
    for row in rows:
        if row["lemme"] != row["ortho"]:
            continue
        cat_gram = CatGram.parse(row["cgram"])
        out.append(
            Mot.for_cat_gram(cat_gram)(
                mot=row["ortho"],
                cat_gram=cat_gram,
                freq=float(row["freqlemlivres"]),
            )
        )
    out.sort(key=sort_key)  # We need to bisect on this.

    # Second pass: populate variants
    for row in rows:
        str_lemme = row["lemme"]
        cat_gram = CatGram.parse(row["cgram"])
        lemme_pos = bisect_left(out, (str_lemme, cat_gram), key=sort_key)
        # bisect_left returns len(out) when the key sorts after every entry,
        # so the bound check must be ``>=`` to avoid indexing past the end.
        if lemme_pos >= len(out):
            continue  # Unknown word
        lemme = out[lemme_pos]
        # Require the category to match as well as the spelling; otherwise
        # the row would be attached to a homograph of another category.
        if lemme.mot != str_lemme or lemme.cat_gram != cat_gram:
            continue  # Unknown word

        if cat_gram == CatGram.NOM:
            genres = match_enum_or_all(row["genre"], Genre)
            nombres = match_enum_or_all(row["nombre"], Nombre)
            # An unspecified gender/number means the spelling is valid for
            # all of them, hence the cartesian product.
            for genre, nombre in itertools.product(genres, nombres):
                variant = Nom.Variant(genre=genre, nombre=nombre)
                lemme._variants[variant] = row["ortho"]

        elif cat_gram == CatGram.VERBE:
            for raw_ver in row["infover"].split(";"):
                ver = raw_ver.split(":")

                personne = None
                if ver[0] == "inf":
                    temps = Temps.INFINITIF
                elif ver[0] == "ind":
                    # Construct instead of testing ``in Temps``: containment
                    # of a raw value raises TypeError before Python 3.12.
                    try:
                        temps = Temps(":".join(ver[0:2]))
                    except ValueError:
                        continue  # Tense not handled
                    personne = Personne(ver[2])
                else:
                    continue  # Mode not handled

                variant = Verbe.Variant(temps=temps, personne=personne)
                lemme._variants[variant] = row["ortho"]

    return cls(out)
|
||||||
|
|
||||||
def most_common(
|
def most_common(
|
||||||
self, cat_gram: CatGram, threshold: t.Optional[int] = None
|
self, cat_gram: CatGram, threshold: t.Optional[int] = None
|
||||||
) -> list[Word]:
|
) -> list[Mot]:
|
||||||
if threshold is None:
|
if threshold is None:
|
||||||
try:
|
try:
|
||||||
threshold = self.PRESET_THRESHOLD_BY_CAT[cat_gram]
|
threshold = self.PRESET_THRESHOLD_BY_CAT[cat_gram]
|
||||||
|
|
Loading…
Reference in a new issue