pwgen-fr/pwgen_fr/lexique.py

242 lines
6.7 KiB
Python

import csv
import itertools
from dataclasses import dataclass
import logging
import subprocess
import typing as t
from bisect import bisect_left
import enum
from pathlib import Path
logger = logging.getLogger(__name__)
class CatGram(enum.Enum):
NOM = "NOM"
VERBE = "VER"
ADJECTIF = "ADJ"
ADVERBE = "ADV"
AUXILIAIRE = "AUX"
ARTICLE = "ART"
CONJONCTION = "CON"
LIAISON = "LIA"
PREPOSITION = "PRE"
PRONOM = "PRO"
ONOMATOPEE = "ONO"
@classmethod
def parse(cls, val: str) -> "CatGram":
"""Parses a 'catgram' entry"""
base = val.split(":", maxsplit=1)[0]
return cls(base)
def __lt__(self, oth):
return self.value < oth.value
def match_enum_or_all(val, enum_cls) -> list:
"""The value of the enum corresponding if any; else, all terms of the enum"""
if val in enum_cls:
return [enum_cls(val)]
return list(enum_cls)
class Genre(enum.Enum):
MASC = "m"
FEM = "f"
class Nombre(enum.Enum):
SING = "s"
PLUR = "p"
class Temps(enum.Enum):
INFINITIF = "inf"
PRESENT = "ind:pre"
FUTUR = "ind:fut"
IMPARFAIT = "ind:imp"
class Personne(enum.Enum):
S1 = "1s"
S2 = "2s"
S3 = "3s"
P1 = "1p"
P2 = "2p"
P3 = "3p"
@dataclass
class _Mot:
"""Canonical form of a word"""
mot: str
cat_gram: CatGram
freq: float # occurrences of the canonical form by million words
class Mot(_Mot):
class Variant:
pass
_for_cat_gram: dict[CatGram, t.Type["Mot"]] = {}
_variants: dict
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._variants = {}
def accord(self, variant: Variant) -> str:
return self._variants[variant]
@classmethod
def for_cat_gram(cls, cat_gram: CatGram) -> t.Type["Mot"]:
"""The class to use for a word of given CatGram"""
return cls._for_cat_gram.get(cat_gram, cls)
class Nom(Mot):
class Variant(t.NamedTuple):
genre: Genre
nombre: Nombre
class Verbe(Mot):
class Variant(t.NamedTuple):
temps: Temps
personne: t.Optional[Personne]
Mot._for_cat_gram = {
CatGram.NOM: Nom,
CatGram.VERBE: Verbe,
}
class Lexique:
LEXIQUE_DIR_PATH = Path(__file__).parent.parent / "data/raw/Lexique383"
LEXIQUE_PATH = LEXIQUE_DIR_PATH / "Lexique383.tsv"
PRESET_THRESHOLD_BY_CAT: dict[CatGram, int] = {
CatGram.NOM: 10000,
CatGram.VERBE: 10000,
CatGram.ADJECTIF: 10000,
CatGram.ADVERBE: 10000,
}
dataset: list[Mot]
def __init__(self, dataset):
self.dataset = dataset
@classmethod
def _ensure_uncompressed(cls):
"""Ensures the dataset is uncompressed"""
if cls.LEXIQUE_DIR_PATH.exists():
return
lexique_archive = cls.LEXIQUE_DIR_PATH.with_suffix(".tar.xz")
if not lexique_archive.exists():
logging.error("Missing compressed dataset at %s", lexique_archive)
raise Exception(f"Missing compressed dataset at {lexique_archive}")
logging.info("Uncompressing dataset")
subprocess.check_call(
[
"tar",
"-xJf",
lexique_archive.as_posix(),
"-C",
lexique_archive.parent.as_posix(),
]
)
if not cls.LEXIQUE_DIR_PATH.exists():
logging.error(
"Uncompressed dataset still missing at %s after extraction",
cls.LEXIQUE_DIR_PATH,
)
raise Exception(
f"Uncompressed dataset still missing at {cls.LEXIQUE_DIR_PATH} after extraction"
)
@classmethod
def parse(cls) -> "Lexique":
out = []
rows = []
with cls.LEXIQUE_PATH.open("r") as h:
reader = csv.DictReader(h, dialect="excel-tab")
for row in reader:
if not row["cgram"]:
continue
rows.append(row)
# First pass: generate canonical forms (lemmes)
for row in rows:
if row["lemme"] != row["ortho"]:
continue
cat_gram = CatGram.parse(row["cgram"])
out.append(
Mot.for_cat_gram(cat_gram)(
mot=row["ortho"],
cat_gram=cat_gram,
freq=float(row["freqlemlivres"]),
)
)
out.sort(key=lambda x: (x.mot, x.cat_gram)) # We need to bisect on this.
# Second pass: populate variants
for row in rows:
str_lemme = row["lemme"]
cat_gram = CatGram.parse(row['cgram'])
lemme_pos = bisect_left(out, (str_lemme, cat_gram), key=lambda x: (x.mot, x.cat_gram))
if lemme_pos > len(out) or out[lemme_pos].mot != str_lemme:
continue # Unknown word
lemme = out[lemme_pos]
if lemme.cat_gram == CatGram.NOM:
genres = match_enum_or_all(row["genre"], Genre)
nombres = match_enum_or_all(row["nombre"], Nombre)
for genre, nombre in itertools.product(genres, nombres):
variant = Nom.Variant(genre=genre, nombre=nombre)
lemme._variants[variant] = row["ortho"]
elif lemme.cat_gram == CatGram.VERBE:
infover = row["infover"].split(";")
for raw_ver in infover:
ver = raw_ver.split(":")
temps = None
personne = None
if ver[0] == "inf":
temps = Temps(ver[0])
elif ver[0] == "ind":
temps_select = ":".join(ver[0:2])
if temps_select not in Temps:
continue
temps = Temps(temps_select)
personne = Personne(ver[2])
else:
continue
variant = Verbe.Variant(temps=temps, personne=personne)
lemme._variants[variant] = row["ortho"]
return cls(out)
def most_common(
self, cat_gram: CatGram, threshold: t.Optional[int] = None
) -> list[Mot]:
if threshold is None:
try:
threshold = self.PRESET_THRESHOLD_BY_CAT[cat_gram]
except KeyError as exn:
raise ValueError(
f"No threshold preset for grammatical category {cat_gram}, "
"please provide a threshold manually"
) from exn
out = list(filter(lambda word: word.cat_gram == cat_gram, self.dataset))
out.sort(key=lambda word: word.freq, reverse=True)
return out[:threshold]