pwgen-fr/pwgen_fr/lexique.py

300 lines
9.7 KiB
Python

import csv
import itertools
from dataclasses import dataclass, field
import logging
import subprocess
import typing as t
from bisect import bisect_left
import enum
from pathlib import Path
from .word_db import Genre, Nombre, Temps, Nom, Adjectif, Verbe, Adverbe, WordDb
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
class CatGram(enum.Enum):
    """Grammatical categories, identified by the base tag of a 'cgram' entry.

    Members are ordered by their tag (see `__lt__`), which allows tuples holding
    a `CatGram` to be sorted and bisected.
    """

    NOM = "NOM"
    VERBE = "VER"
    ADJECTIF = "ADJ"
    ADVERBE = "ADV"
    AUXILIAIRE = "AUX"
    ARTICLE = "ART"
    CONJONCTION = "CON"
    LIAISON = "LIA"
    PREPOSITION = "PRE"
    PRONOM = "PRO"
    ONOMATOPEE = "ONO"

    @classmethod
    def parse(cls, val: str) -> "CatGram":
        """Parses a 'catgram' entry

        Only the text before the first ':' is significant: a value such as
        "ART:def" is parsed as `ARTICLE`.

        Raises:
            ValueError: the base tag does not match any category.
        """
        base = val.split(":", maxsplit=1)[0]
        return cls(base)

    def __lt__(self, oth):
        """Orders categories by the alphabetical order of their tag"""
        if not isinstance(oth, CatGram):
            # Let Python try the reflected operation, then raise its usual
            # TypeError, instead of failing on a missing `.value` attribute.
            return NotImplemented
        return self.value < oth.value
def match_enum_or_all(val: str, enum_mapper, enum_cls) -> list:
    """Resolve `val` through `enum_mapper`, defaulting to every term of `enum_cls`.

    Returns a one-element list holding the mapped term when `val` is a key of
    `enum_mapper`; otherwise returns all the terms of `enum_cls`, in iteration
    order.
    """
    if val not in enum_mapper:
        # Unknown or unspecified value: every term is a candidate.
        return list(enum_cls)
    return [enum_mapper[val]]
@dataclass
class Mot:
    """A word of the lexicon, together with its known inflected forms"""

    # Spelling of this word
    mot: str
    # Lemma this word belongs to
    lemme: str
    # Grammatical category of the word
    cat_gram: CatGram
    # occurrences of the canonical form by million words
    freq: float
    # Inflected spellings, indexed by a tuple of grammatical features
    variantes: dict[tuple, str] = field(default_factory=dict)
    # Grammatical gender, if any is known
    genre: t.Optional[Genre] = field(default=None)
class Lexique:
    """In-memory view of the Lexique383 dataset.

    `dataset` holds the canonical words, each carrying its inflected forms, sorted
    by `_find_word_key`. `lemfreq` maps every spelling of the dataset to the highest
    lemma frequency found for it.
    """

    LEXIQUE_DIR_PATH = Path(__file__).parent.parent / "data/raw/Lexique383"
    LEXIQUE_PATH = LEXIQUE_DIR_PATH / "Lexique383.tsv"

    # Number of words kept by `most_common` when no threshold is provided
    PRESET_THRESHOLD_BY_CAT: dict[CatGram, int] = {
        CatGram.NOM: 10000,
        CatGram.VERBE: 10000,
        CatGram.ADJECTIF: 10000,
        CatGram.ADVERBE: 10000,
    }

    class Parsers:
        """Datatables to help parse the original data"""

        # 'genre' column -> Genre
        genre: dict[str, Genre] = {
            "m": Genre.MASC,
            "f": Genre.FEM,
        }
        # Genre -> 'genre' column; None stands for an empty column
        rev_genre: dict[t.Optional[Genre], str] = {
            None: "",
            Genre.MASC: "m",
            Genre.FEM: "f",
        }
        # 'nombre' column -> Nombre
        nombre: dict[str, Nombre] = {
            "s": Nombre.SING,
            "p": Nombre.PLUR,
        }
        # Tense tags of the 'infover' column (not referenced elsewhere in this class)
        verbe_temps: dict[str, Temps] = {
            "ind:pre": Temps.PRESENT,
            "ind:fut": Temps.FUTUR,
            "ind:imp": Temps.IMPARFAIT,
        }
        # Person tags of the 'infover' column; only third persons are kept
        verbe_personne: dict[str, Nombre] = {
            "3s": Nombre.SING,
            "3p": Nombre.PLUR,
        }

    dataset: list[Mot]
    lemfreq: dict[str, float]

    def __init__(self, dataset: list[Mot], lemfreq: dict[str, float]):
        self.dataset = dataset
        self.lemfreq = lemfreq

    @classmethod
    def _ensure_uncompressed(cls):
        """Ensures the dataset is uncompressed

        Does nothing when the dataset directory already exists. Otherwise, the
        `.tar.xz` archive sitting next to it is extracted with the system `tar`.

        Raises:
            FileNotFoundError: the archive is missing, or the dataset directory is
                still missing once the archive has been extracted.
            subprocess.CalledProcessError: `tar` exited with a non-zero status.
        """
        if cls.LEXIQUE_DIR_PATH.exists():
            return

        lexique_archive = cls.LEXIQUE_DIR_PATH.with_suffix(".tar.xz")
        if not lexique_archive.exists():
            logger.error("Missing compressed dataset at %s", lexique_archive)
            raise FileNotFoundError(f"Missing compressed dataset at {lexique_archive}")

        logger.info("Uncompressing dataset")
        subprocess.check_call(
            [
                "tar",
                "-xJf",
                lexique_archive.as_posix(),
                "-C",
                lexique_archive.parent.as_posix(),
            ]
        )

        if not cls.LEXIQUE_DIR_PATH.exists():
            logger.error(
                "Uncompressed dataset still missing at %s after extraction",
                cls.LEXIQUE_DIR_PATH,
            )
            raise FileNotFoundError(
                f"Uncompressed dataset still missing at {cls.LEXIQUE_DIR_PATH} "
                "after extraction"
            )

    @classmethod
    def _find_word_key(cls, mot: Mot):
        """Sort and lookup key of a canonical word: (lemma, category, gender tag)"""
        return (mot.lemme, mot.cat_gram, cls.Parsers.rev_genre[mot.genre])

    @classmethod
    def _find_word(cls, dataset: list[Mot], row: dict) -> t.Optional[Mot]:
        """Finds the canonical word a dataset row is a form of.

        `dataset` must be sorted by `_find_word_key`, as it is bisected. The gender
        of the row is only taken into account for nouns.

        Returns the first matching word, or None if the row has no canonical word.
        """
        cat_gram = CatGram.parse(row["cgram"])
        row_key = (
            row["lemme"],
            cat_gram,
            row["genre"] if cat_gram == CatGram.NOM else "",
        )

        lemme_pos = bisect_left(dataset, row_key, key=cls._find_word_key)
        if lemme_pos >= len(dataset):
            return None

        out = dataset[lemme_pos]
        # bisect only yields an insertion point: check that it is an actual match
        if row_key != cls._find_word_key(out):
            return None
        return out

    @classmethod
    def parse(cls) -> "Lexique":
        """Parses the dataset file into a `Lexique`.

        A first pass creates one `Mot` per canonical form; a second pass attaches
        each row to its canonical form as a variant, and records lemma frequencies.

        Raises:
            FileNotFoundError: the dataset is missing and cannot be extracted.
        """
        # Only attempt an extraction when parsing would otherwise fail.
        if not cls.LEXIQUE_PATH.exists():
            cls._ensure_uncompressed()

        # The encoding is explicit so that accented characters do not depend on the
        # locale; `newline=""` is what the csv module expects from its input file.
        with cls.LEXIQUE_PATH.open("r", encoding="utf-8", newline="") as h:
            reader = csv.DictReader(h, dialect="excel-tab")
            rows = [row for row in reader if row["cgram"]]

        # First pass: generate canonical forms (lemmes)
        out: list[Mot] = []
        for row in rows:
            cat_gram = CatGram.parse(row["cgram"])
            # A feminine singular noun is considered a canonical form
            is_fem_sing_nom = (
                cat_gram == CatGram.NOM and row["genre"] == "f" and row["nombre"] == "s"
            )
            if row["lemme"] != row["ortho"] and not is_fem_sing_nom:
                continue

            genre: t.Optional[Genre] = None
            if cat_gram == CatGram.NOM:
                genre = cls.Parsers.genre.get(row["genre"], None)
            out.append(
                Mot(
                    mot=row["ortho"],
                    lemme=row["lemme"],
                    cat_gram=cat_gram,
                    freq=float(row["freqlemlivres"]),
                    genre=genre,
                )
            )
        out.sort(key=cls._find_word_key)  # We need to bisect on this.

        # Looking a plain string up in an Enum class raises TypeError before
        # Python 3.12: tenses are resolved through their values instead.
        temps_by_value = {temps.value: temps for temps in Temps}

        # Second pass: populate variants
        lemfreq: dict[str, float] = {}
        for row in rows:
            # Populate lemfreq
            old_freq = lemfreq.get(row["ortho"], 0.0)
            lemfreq[row["ortho"]] = max(
                old_freq,
                float(row["freqlemlivres"]),
                float(row["freqlemfilms2"]),
            )

            lemme = cls._find_word(out, row)
            if lemme is None:
                continue

            if lemme.cat_gram == CatGram.NOM:
                nombres = match_enum_or_all(row["nombre"], cls.Parsers.nombre, Nombre)
                for nombre in nombres:
                    lemme.variantes[(nombre,)] = row["ortho"]
            elif lemme.cat_gram == CatGram.VERBE:
                for raw_ver in row["infover"].split(";"):
                    ver = raw_ver.split(":")
                    if len(ver) < 3:
                        continue  # no person field: nothing to index this form by
                    temps = temps_by_value.get(":".join(ver[0:2]))
                    if temps is None:
                        continue
                    personne = cls.Parsers.verbe_personne.get(ver[2], None)
                    if personne is None:
                        continue  # we're not interested in all conj. persons
                    lemme.variantes[(temps, personne)] = row["ortho"]
            elif lemme.cat_gram == CatGram.ADJECTIF:
                genres = match_enum_or_all(row["genre"], cls.Parsers.genre, Genre)
                nombres = match_enum_or_all(row["nombre"], cls.Parsers.nombre, Nombre)
                for genre, nombre in itertools.product(genres, nombres):
                    lemme.variantes[(genre, nombre)] = row["ortho"]
            # No need to match adverbs (invariant)

        return cls(out, lemfreq)

    def most_common(
        self, cat_gram: CatGram, threshold: t.Optional[int] = None
    ) -> list[Mot]:
        """The `threshold` most frequent words of a grammatical category.

        Words are sorted by decreasing `freq`. When `threshold` is None, the preset
        of `PRESET_THRESHOLD_BY_CAT` for this category is used.

        Raises:
            ValueError: no threshold was provided and the category has no preset.
        """
        if threshold is None:
            try:
                threshold = self.PRESET_THRESHOLD_BY_CAT[cat_gram]
            except KeyError as exn:
                raise ValueError(
                    f"No threshold preset for grammatical category {cat_gram}, "
                    "please provide a threshold manually"
                ) from exn
        out = [word for word in self.dataset if word.cat_gram == cat_gram]
        out.sort(key=lambda word: word.freq, reverse=True)
        return out[:threshold]

    @staticmethod
    def _select_variantes(mot: Mot, keys: list[tuple]) -> t.Optional[list[str]]:
        """The variants of `mot` for each of `keys`, or None if any is missing"""
        try:
            return [mot.variantes[key] for key in keys]
        except KeyError as exn:
            logger.debug(
                "Skipping %r (%s): missing variant %s",
                mot.mot,
                mot.cat_gram.value,
                exn,
            )
            return None

    def word_db(self, thresholds: t.Optional[dict[CatGram, int]] = None) -> WordDb:
        """Convert to a WordDb

        `thresholds` optionally overrides, per category, the number of most common
        words considered (see `most_common`). Words lacking one of the forms their
        category requires are left out rather than raising a KeyError, so a category
        may end up with fewer words than its threshold.
        """
        thresholds = thresholds or {}

        db_noms = []
        for nom in self.most_common(CatGram.NOM, thresholds.get(CatGram.NOM, None)):
            formes = self._select_variantes(nom, [(Nombre.SING,), (Nombre.PLUR,)])
            if formes is None:
                continue
            sing, plur = formes
            db_noms.append(
                Nom(
                    # The cast only informs the type checker.
                    # NOTE(review): `parse` leaves the gender to None for nouns whose
                    # 'genre' column is neither "m" nor "f" -- confirm Nom copes.
                    genre=t.cast(Genre, nom.genre),
                    sing=sing,
                    plur=plur,
                )
            )

        db_adjectifs = []
        for adj in self.most_common(
            CatGram.ADJECTIF, thresholds.get(CatGram.ADJECTIF, None)
        ):
            formes = self._select_variantes(
                adj,
                [
                    (Genre.MASC, Nombre.SING),
                    (Genre.MASC, Nombre.PLUR),
                    (Genre.FEM, Nombre.SING),
                    (Genre.FEM, Nombre.PLUR),
                ],
            )
            if formes is None:
                continue
            masc_sing, masc_plur, fem_sing, fem_plur = formes
            db_adjectifs.append(
                Adjectif(
                    masc_sing=masc_sing,
                    masc_plur=masc_plur,
                    fem_sing=fem_sing,
                    fem_plur=fem_plur,
                )
            )

        db_verbes = []
        for verbe in self.most_common(
            CatGram.VERBE, thresholds.get(CatGram.VERBE, None)
        ):
            formes = self._select_variantes(
                verbe,
                [
                    (Temps.PRESENT, Nombre.SING),
                    (Temps.PRESENT, Nombre.PLUR),
                    (Temps.FUTUR, Nombre.SING),
                    (Temps.FUTUR, Nombre.PLUR),
                    (Temps.IMPARFAIT, Nombre.SING),
                    (Temps.IMPARFAIT, Nombre.PLUR),
                ],
            )
            if formes is None:
                continue
            (
                present_sing,
                present_plur,
                futur_sing,
                futur_plur,
                imparfait_sing,
                imparfait_plur,
            ) = formes
            db_verbes.append(
                Verbe(
                    present_sing=present_sing,
                    present_plur=present_plur,
                    futur_sing=futur_sing,
                    futur_plur=futur_plur,
                    imparfait_sing=imparfait_sing,
                    imparfait_plur=imparfait_plur,
                )
            )

        adverbes = self.most_common(
            CatGram.ADVERBE, thresholds.get(CatGram.ADVERBE, None)
        )
        db_adverbes = [Adverbe(adv=adv.mot) for adv in adverbes]

        return WordDb(
            noms=db_noms, adjectifs=db_adjectifs, verbes=db_verbes, adverbes=db_adverbes
        )