300 lines
9.7 KiB
Python
300 lines
9.7 KiB
Python
import csv
|
|
import itertools
|
|
from dataclasses import dataclass, field
|
|
import logging
|
|
import subprocess
|
|
import typing as t
|
|
from bisect import bisect_left
|
|
import enum
|
|
from pathlib import Path
|
|
from .word_db import Genre, Nombre, Temps, Nom, Adjectif, Verbe, Adverbe, WordDb
|
|
|
|
# Logger named after this module (via __name__).
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class CatGram(enum.Enum):
    """Grammatical categories, each valued by its three-letter tag."""

    NOM = "NOM"
    VERBE = "VER"
    ADJECTIF = "ADJ"
    ADVERBE = "ADV"
    AUXILIAIRE = "AUX"
    ARTICLE = "ART"
    CONJONCTION = "CON"
    LIAISON = "LIA"
    PREPOSITION = "PRE"
    PRONOM = "PRO"
    ONOMATOPEE = "ONO"

    @classmethod
    def parse(cls, val: str) -> "CatGram":
        """Parse a 'catgram' entry into its base category.

        Only the text before the first ``:`` is looked up, so a value of the
        form ``"ADJ:xyz"`` resolves to ``CatGram.ADJECTIF``.

        Raises:
            ValueError: if the base tag does not match any member's value.
        """
        base, _, _ = val.partition(":")
        return cls(base)

    def __lt__(self, oth):
        """Order categories by their string value.

        Defining an ordering lets tuples that contain a ``CatGram`` be sorted
        and bisected. Returns ``NotImplemented`` for foreign operands so that
        Python raises its usual ``TypeError`` rather than an ``AttributeError``
        on ``oth.value``.
        """
        if not isinstance(oth, CatGram):
            return NotImplemented
        return self.value < oth.value
|
|
|
|
|
|
def match_enum_or_all(val: str, enum_mapper, enum_cls) -> list:
    """Resolve `val` through `enum_mapper`, falling back to every term.

    Returns a one-element list holding the mapped entry when `val` is a key
    of `enum_mapper`; otherwise returns a list of everything obtained by
    iterating over `enum_cls`.
    """
    return [enum_mapper[val]] if val in enum_mapper else list(enum_cls)
|
|
|
|
|
|
@dataclass
class Mot:
    """A lexicon entry: one word together with its known inflected forms."""

    # Spelling of this entry.
    mot: str
    # Canonical form (lemma) this entry is attached to.
    lemme: str
    # Grammatical category of the entry.
    cat_gram: CatGram
    # Occurrences of the canonical form by million words.
    freq: float
    # Inflected spellings, keyed by a tuple of grammatical features.
    variantes: dict[tuple, str] = field(default_factory=dict)
    # Grammatical gender, when one applies; None otherwise.
    genre: t.Optional[Genre] = field(default=None)
|
|
|
|
|
|
class Lexique:
    """In-memory view of the Lexique383 dataset.

    `dataset` holds the canonical words (sorted by `_find_word_key`) with
    their inflected forms attached; `lemfreq` maps each spelling found in the
    file to the highest lemma frequency seen for it.
    """

    LEXIQUE_DIR_PATH = Path(__file__).parent.parent / "data/raw/Lexique383"
    LEXIQUE_PATH = LEXIQUE_DIR_PATH / "Lexique383.tsv"

    # Default number of words kept by `most_common` for each category.
    PRESET_THRESHOLD_BY_CAT: dict[CatGram, int] = {
        CatGram.NOM: 10000,
        CatGram.VERBE: 10000,
        CatGram.ADJECTIF: 10000,
        CatGram.ADVERBE: 10000,
    }

    class Parsers:
        """Datatables to help parse the original data"""

        genre: dict[str, Genre] = {
            "m": Genre.MASC,
            "f": Genre.FEM,
        }
        rev_genre: dict[t.Optional[Genre], str] = {
            None: "",
            Genre.MASC: "m",
            Genre.FEM: "f",
        }
        nombre: dict[str, Nombre] = {
            "s": Nombre.SING,
            "p": Nombre.PLUR,
        }
        verbe_temps: dict[str, Temps] = {
            "ind:pre": Temps.PRESENT,
            "ind:fut": Temps.FUTUR,
            "ind:imp": Temps.IMPARFAIT,
        }
        verbe_personne: dict[str, Nombre] = {
            "3s": Nombre.SING,
            "3p": Nombre.PLUR,
        }

    dataset: list[Mot]
    lemfreq: dict[str, float]

    def __init__(self, dataset: list[Mot], lemfreq: dict[str, float]):
        """Store an already-parsed dataset and its frequency table."""
        self.dataset = dataset
        self.lemfreq = lemfreq

    @classmethod
    def _ensure_uncompressed(cls):
        """Ensure the dataset directory exists, extracting the archive if needed.

        Does nothing when `LEXIQUE_DIR_PATH` already exists. Otherwise the
        sibling `.tar.xz` archive is extracted next to it using `tar`.

        Raises:
            FileNotFoundError: if the archive is missing, or if the dataset
                directory is still absent once extraction has finished.
            subprocess.CalledProcessError: if `tar` exits with a non-zero
                status.
        """
        if cls.LEXIQUE_DIR_PATH.exists():
            return

        lexique_archive = cls.LEXIQUE_DIR_PATH.with_suffix(".tar.xz")
        if not lexique_archive.exists():
            logger.error("Missing compressed dataset at %s", lexique_archive)
            raise FileNotFoundError(
                f"Missing compressed dataset at {lexique_archive}"
            )

        logger.info("Uncompressing dataset")
        subprocess.check_call(
            [
                "tar",
                "-xJf",
                lexique_archive.as_posix(),
                "-C",
                lexique_archive.parent.as_posix(),
            ]
        )

        if not cls.LEXIQUE_DIR_PATH.exists():
            logger.error(
                "Uncompressed dataset still missing at %s after extraction",
                cls.LEXIQUE_DIR_PATH,
            )
            raise FileNotFoundError(
                f"Uncompressed dataset still missing at {cls.LEXIQUE_DIR_PATH} after extraction"
            )

    @classmethod
    def _find_word_key(cls, mot: Mot) -> tuple:
        """Sort/search key of a word: (lemma, category, gender letter)."""
        return (mot.lemme, mot.cat_gram, cls.Parsers.rev_genre[mot.genre])

    @classmethod
    def _find_word(cls, dataset: list[Mot], row: dict) -> t.Optional[Mot]:
        """Find the canonical word a raw row belongs to.

        `dataset` must be sorted by `_find_word_key`, since the lookup is a
        binary search. Returns None when no entry has the row's key.
        """
        cat_gram = CatGram.parse(row["cgram"])
        # Gender only takes part in the key for nouns.
        genre = row["genre"] if cat_gram == CatGram.NOM else ""
        row_key = (row["lemme"], cat_gram, genre)

        lemme_pos = bisect_left(dataset, row_key, key=cls._find_word_key)
        if lemme_pos >= len(dataset):
            return None
        out = dataset[lemme_pos]
        if row_key != cls._find_word_key(out):
            return None
        return out

    @classmethod
    def _read_rows(cls) -> list[dict]:
        """Read every row of the TSV file that has a non-empty `cgram`."""
        # An explicit encoding keeps decoding independent of the platform
        # locale; newline="" is what the csv module asks for.
        # NOTE(review): assumes the TSV is UTF-8 encoded - confirm.
        with cls.LEXIQUE_PATH.open("r", encoding="utf-8", newline="") as h:
            reader = csv.DictReader(h, dialect="excel-tab")
            return [row for row in reader if row["cgram"]]

    @classmethod
    def _canonical_words(cls, rows: list[dict]) -> list[Mot]:
        """Create one `Mot` per canonical row, sorted by `_find_word_key`."""
        out = []
        for row in rows:
            cat_gram = CatGram.parse(row["cgram"])
            is_fem_sing_nom = (
                cat_gram == CatGram.NOM
                and row["genre"] == "f"
                and row["nombre"] == "s"
            )
            # A row is canonical when its spelling is its own lemma. A
            # singular feminine noun is also considered a canonical form.
            if row["lemme"] != row["ortho"] and not is_fem_sing_nom:
                continue

            genre: t.Optional[Genre] = None
            if cat_gram == CatGram.NOM:
                genre = cls.Parsers.genre.get(row["genre"])
            out.append(
                Mot(
                    mot=row["ortho"],
                    lemme=row["lemme"],
                    cat_gram=cat_gram,
                    freq=float(row["freqlemlivres"]),
                    genre=genre,
                )
            )

        out.sort(key=cls._find_word_key)  # `_find_word` bisects on this.
        return out

    @classmethod
    def _add_variantes(cls, lemme: Mot, row: dict) -> None:
        """Record the row's spelling as an inflected form of `lemme`."""
        ortho = row["ortho"]

        if lemme.cat_gram == CatGram.NOM:
            nombres = match_enum_or_all(row["nombre"], cls.Parsers.nombre, Nombre)
            for nombre in nombres:
                lemme.variantes[(nombre,)] = ortho

        elif lemme.cat_gram == CatGram.VERBE:
            for raw_ver in row["infover"].split(";"):
                ver = raw_ver.split(":")
                # Look the tense up by value. A membership test of a plain
                # string against an Enum class raises TypeError before
                # Python 3.12, so rely on the ValueError of the constructor.
                try:
                    temps = Temps(":".join(ver[0:2]))
                except ValueError:
                    continue
                if len(ver) < 3:
                    continue  # no person information in this entry
                personne = cls.Parsers.verbe_personne.get(ver[2])
                if personne is None:
                    continue  # we're not interested in all conj. persons
                lemme.variantes[(temps, personne)] = ortho

        elif lemme.cat_gram == CatGram.ADJECTIF:
            genres = match_enum_or_all(row["genre"], cls.Parsers.genre, Genre)
            nombres = match_enum_or_all(row["nombre"], cls.Parsers.nombre, Nombre)
            for genre, nombre in itertools.product(genres, nombres):
                lemme.variantes[(genre, nombre)] = ortho

        # No need to match adverbs (invariant)

    @classmethod
    def parse(cls) -> "Lexique":
        """Build a `Lexique` from the Lexique383 TSV file.

        The dataset is extracted first if it is not already on disk. Rows are
        then processed in two passes: the first collects canonical forms, the
        second attaches each row's spelling to its canonical form and fills
        the frequency table.

        Raises:
            FileNotFoundError: if the dataset is neither extracted nor
                available as an archive.
        """
        cls._ensure_uncompressed()
        rows = cls._read_rows()

        # First pass: generate canonical forms (lemmes)
        out = cls._canonical_words(rows)

        # Second pass: populate lemfreq and variants
        lemfreq: dict[str, float] = {}
        for row in rows:
            ortho = row["ortho"]
            lemfreq[ortho] = max(
                lemfreq.get(ortho, 0.0),
                float(row["freqlemlivres"]),
                float(row["freqlemfilms2"]),
            )

            lemme = cls._find_word(out, row)
            if lemme is not None:
                cls._add_variantes(lemme, row)

        return cls(out, lemfreq)

    def most_common(
        self, cat_gram: CatGram, threshold: t.Optional[int] = None
    ) -> list[Mot]:
        """Return the most frequent words of a grammatical category.

        Args:
            cat_gram: category to select.
            threshold: maximum number of words returned; defaults to the
                preset of `PRESET_THRESHOLD_BY_CAT` for this category.

        Raises:
            ValueError: if `threshold` is None and the category has no preset.
        """
        if threshold is None:
            try:
                threshold = self.PRESET_THRESHOLD_BY_CAT[cat_gram]
            except KeyError as exn:
                raise ValueError(
                    f"No threshold preset for grammatical category {cat_gram}, "
                    "please provide a threshold manually"
                ) from exn
        out = [word for word in self.dataset if word.cat_gram == cat_gram]
        out.sort(key=lambda word: word.freq, reverse=True)
        return out[:threshold]

    def word_db(self, thresholds: t.Optional[dict[CatGram, int]] = None) -> WordDb:
        """Convert to a WordDb

        Args:
            thresholds: optional per-category limits passed to `most_common`;
                categories left out use their preset.

        Raises:
            KeyError: if a selected noun, adjective or verb lacks one of the
                variants looked up below.
        """
        thresholds = thresholds or {}

        noms = self.most_common(CatGram.NOM, thresholds.get(CatGram.NOM, None))
        db_noms = [
            Nom(
                genre=t.cast(Genre, nom.genre),  # not None for noms
                sing=nom.variantes[(Nombre.SING,)],
                plur=nom.variantes[(Nombre.PLUR,)],
            )
            for nom in noms
        ]

        adjectifs = self.most_common(
            CatGram.ADJECTIF, thresholds.get(CatGram.ADJECTIF, None)
        )
        db_adjectifs = [
            Adjectif(
                masc_sing=adj.variantes[(Genre.MASC, Nombre.SING)],
                masc_plur=adj.variantes[(Genre.MASC, Nombre.PLUR)],
                fem_sing=adj.variantes[(Genre.FEM, Nombre.SING)],
                fem_plur=adj.variantes[(Genre.FEM, Nombre.PLUR)],
            )
            for adj in adjectifs
        ]

        verbes = self.most_common(CatGram.VERBE, thresholds.get(CatGram.VERBE, None))
        db_verbes = [
            Verbe(
                present_sing=verbe.variantes[(Temps.PRESENT, Nombre.SING)],
                present_plur=verbe.variantes[(Temps.PRESENT, Nombre.PLUR)],
                futur_sing=verbe.variantes[(Temps.FUTUR, Nombre.SING)],
                futur_plur=verbe.variantes[(Temps.FUTUR, Nombre.PLUR)],
                imparfait_sing=verbe.variantes[(Temps.IMPARFAIT, Nombre.SING)],
                imparfait_plur=verbe.variantes[(Temps.IMPARFAIT, Nombre.PLUR)],
            )
            for verbe in verbes
        ]

        adverbes = self.most_common(
            CatGram.ADVERBE, thresholds.get(CatGram.ADVERBE, None)
        )
        db_adverbes = [Adverbe(adv=adv.mot) for adv in adverbes]

        return WordDb(
            noms=db_noms, adjectifs=db_adjectifs, verbes=db_verbes, adverbes=db_adverbes
        )
|