import csv
import itertools
from dataclasses import dataclass, field
import logging
import subprocess
import typing as t
from bisect import bisect_left
import enum
from pathlib import Path

from .word_db import Genre, Nombre, Temps, Nom, Adjectif, Verbe, Adverbe, WordDb

logger = logging.getLogger(__name__)


class CatGram(enum.Enum):
    """Grammatical categories, identified by the base tag of a 'cgram' entry."""

    NOM = "NOM"
    VERBE = "VER"
    ADJECTIF = "ADJ"
    ADVERBE = "ADV"
    AUXILIAIRE = "AUX"
    ARTICLE = "ART"
    CONJONCTION = "CON"
    LIAISON = "LIA"
    PREPOSITION = "PRE"
    PRONOM = "PRO"
    ONOMATOPEE = "ONO"

    @classmethod
    def parse(cls, val: str) -> "CatGram":
        """Parse a 'cgram' entry, keeping only the part before the first ':'.

        Raises:
            ValueError: if that base tag is not the value of any member.
        """
        base = val.split(":", maxsplit=1)[0]
        return cls(base)

    def __lt__(self, oth):
        """Order members by their string value.

        Members end up inside the tuples used as sort/bisect keys by
        `Lexique`, so they must be comparable with `<`.
        """
        if not isinstance(oth, CatGram):
            # Let Python try the reflected operation / raise TypeError itself.
            return NotImplemented
        return self.value < oth.value


def match_enum_or_all(val: str, enum_mapper, enum_cls) -> list:
    """Return the member mapped from `val`, or every member if `val` is unmapped.

    Args:
        val: raw value to look up.
        enum_mapper: mapping from raw values to members of `enum_cls`.
        enum_cls: enum to fall back on; all of its members are returned when
            `val` is not a key of `enum_mapper`.

    Returns:
        A one-element list holding the mapped member, or the list of all
        members of `enum_cls`.
    """
    if val in enum_mapper:
        return [enum_mapper[val]]
    return list(enum_cls)


@dataclass
class Mot:
    """A canonical word together with its known inflected forms."""

    mot: str
    lemme: str
    cat_gram: CatGram
    freq: float  # occurrences of the canonical form by million words
    # Inflected spellings, keyed by a tuple of enum members:
    # (Nombre,) for nouns, (Temps, Nombre) for verbs, (Genre, Nombre) for
    # adjectives -- see Lexique.parse.
    variantes: dict[tuple, str] = field(default_factory=dict)
    genre: t.Optional[Genre] = None


class Lexique:
    """In-memory view of the Lexique383 TSV dataset.

    Holds the canonical words (`dataset`) and, for each spelling found in the
    file, the highest lemma frequency seen for it (`lemfreq`).
    """

    LEXIQUE_DIR_PATH = Path(__file__).parent.parent / "data/raw/Lexique383"
    LEXIQUE_PATH = LEXIQUE_DIR_PATH / "Lexique383.tsv"

    # Default number of words kept per category by `most_common`.
    PRESET_THRESHOLD_BY_CAT: dict[CatGram, int] = {
        CatGram.NOM: 10000,
        CatGram.VERBE: 10000,
        CatGram.ADJECTIF: 10000,
        CatGram.ADVERBE: 10000,
    }

    class Parsers:
        """Datatables to help parse the original data"""

        genre: dict[str, Genre] = {
            "m": Genre.MASC,
            "f": Genre.FEM,
        }
        # Inverse of `genre`, with None mapped to the empty string; used to
        # rebuild the raw genre column when computing sort keys.
        rev_genre: dict[t.Optional[Genre], str] = {
            None: "",
            Genre.MASC: "m",
            Genre.FEM: "f",
        }
        nombre: dict[str, Nombre] = {
            "s": Nombre.SING,
            "p": Nombre.PLUR,
        }
        verbe_temps: dict[str, Temps] = {
            "ind:pre": Temps.PRESENT,
            "ind:fut": Temps.FUTUR,
            "ind:imp": Temps.IMPARFAIT,
        }
        # Only third-person forms are mapped; other persons are ignored.
        verbe_personne: dict[str, Nombre] = {
            "3s": Nombre.SING,
            "3p": Nombre.PLUR,
        }

    dataset: list[Mot]
    lemfreq: dict[str, float]

    def __init__(self, dataset, lemfreq):
        """Store an already-parsed dataset and frequency table."""
        self.dataset = dataset
        self.lemfreq = lemfreq

    @classmethod
    def _ensure_uncompressed(cls):
        """Ensure the dataset directory exists, extracting the archive if needed.

        Does nothing when `LEXIQUE_DIR_PATH` already exists. Otherwise runs
        `tar -xJf` on the sibling `.tar.xz` archive.

        Raises:
            FileNotFoundError: if the archive is missing, or if the directory
                is still missing once extraction has completed.
            subprocess.CalledProcessError: if `tar` exits with a non-zero
                status.
        """
        if cls.LEXIQUE_DIR_PATH.exists():
            return

        lexique_archive = cls.LEXIQUE_DIR_PATH.with_suffix(".tar.xz")
        if not lexique_archive.exists():
            logger.error("Missing compressed dataset at %s", lexique_archive)
            raise FileNotFoundError(
                f"Missing compressed dataset at {lexique_archive}"
            )

        logger.info("Uncompressing dataset")
        subprocess.check_call(
            [
                "tar",
                "-xJf",
                lexique_archive.as_posix(),
                "-C",
                lexique_archive.parent.as_posix(),
            ]
        )

        if not cls.LEXIQUE_DIR_PATH.exists():
            logger.error(
                "Uncompressed dataset still missing at %s after extraction",
                cls.LEXIQUE_DIR_PATH,
            )
            raise FileNotFoundError(
                f"Uncompressed dataset still missing at {cls.LEXIQUE_DIR_PATH} after extraction"
            )

    @classmethod
    def _find_word_key(cls, mot: Mot):
        """Return the (lemme, category, raw genre) key used to sort and bisect."""
        return (mot.lemme, mot.cat_gram, cls.Parsers.rev_genre[mot.genre])

    @classmethod
    def _find_word(cls, dataset: list[Mot], row: dict) -> t.Optional[Mot]:
        """Find the canonical word a raw row belongs to.

        Args:
            dataset: canonical words, sorted by `_find_word_key`.
            row: raw row; its "lemme", "cgram" and "genre" entries are read.

        Returns:
            The first word of `dataset` whose key equals the row's key, or
            None if there is none.
        """
        cat_gram = CatGram.parse(row["cgram"])
        # Genre only takes part in the key for nouns, mirroring how `parse`
        # fills in `Mot.genre`.
        genre = row["genre"] if cat_gram == CatGram.NOM else ""
        row_key = (row["lemme"], cat_gram, genre)

        lemme_pos = bisect_left(dataset, row_key, key=cls._find_word_key)
        if lemme_pos >= len(dataset):
            return None

        candidate = dataset[lemme_pos]
        if row_key != cls._find_word_key(candidate):
            return None
        return candidate

    @classmethod
    def _register_variant(cls, lemme: Mot, row: dict) -> None:
        """Record the spelling of `row` as an inflected form of `lemme`."""
        ortho = row["ortho"]

        if lemme.cat_gram == CatGram.NOM:
            nombres = match_enum_or_all(row["nombre"], cls.Parsers.nombre, Nombre)
            for nombre in nombres:
                lemme.variantes[(nombre,)] = ortho

        elif lemme.cat_gram == CatGram.VERBE:
            for raw_ver in row["infover"].split(";"):
                ver = raw_ver.split(":")
                # Build the Temps member directly: testing `str in Temps`
                # raises TypeError on Python versions older than 3.12.
                try:
                    temps = Temps(":".join(ver[0:2]))
                except ValueError:
                    continue
                if len(ver) < 3:
                    continue  # no person information for this entry
                personne = cls.Parsers.verbe_personne.get(ver[2], None)
                if personne is None:
                    continue  # we're not interested in all conj. persons
                lemme.variantes[(temps, personne)] = ortho

        elif lemme.cat_gram == CatGram.ADJECTIF:
            genres = match_enum_or_all(row["genre"], cls.Parsers.genre, Genre)
            nombres = match_enum_or_all(row["nombre"], cls.Parsers.nombre, Nombre)
            for genre, nombre in itertools.product(genres, nombres):
                lemme.variantes[(genre, nombre)] = ortho

        # No need to match adverbs (invariant)

    @classmethod
    def parse(cls) -> "Lexique":
        """Build a Lexique from the TSV file at `LEXIQUE_PATH`.

        Rows with an empty "cgram" are ignored. A first pass creates one
        `Mot` per canonical row; a second pass attaches every row's spelling
        to its canonical word and fills the frequency table.

        Returns:
            The parsed Lexique.
        """
        out = []
        lemfreq: dict[str, float] = {}

        # NOTE(review): assumes the TSV is UTF-8 encoded -- confirm.
        with cls.LEXIQUE_PATH.open("r", encoding="utf-8", newline="") as h:
            reader = csv.DictReader(h, dialect="excel-tab")
            rows = [row for row in reader if row["cgram"]]

        # First pass: generate canonical forms (lemmes)
        for row in rows:
            cat_gram = CatGram.parse(row["cgram"])
            is_fem_sing_noun = (
                cat_gram == CatGram.NOM
                and row["genre"] == "f"
                and row["nombre"] == "s"
            )
            # A feminine singular noun is treated as a canonical form even
            # when its spelling differs from the lemme.
            if row["lemme"] != row["ortho"] and not is_fem_sing_noun:
                continue

            genre: t.Optional[Genre] = None
            if cat_gram == CatGram.NOM:
                genre = cls.Parsers.genre.get(row["genre"], None)

            out.append(
                Mot(
                    mot=row["ortho"],
                    lemme=row["lemme"],
                    cat_gram=cat_gram,
                    freq=float(row["freqlemlivres"]),
                    genre=genre,
                )
            )

        out.sort(key=cls._find_word_key)  # We need to bisect on this.

        # Second pass: populate variants
        for row in rows:
            # Populate lemfreq: keep the highest frequency seen per spelling.
            old_freq = lemfreq.get(row["ortho"], 0.0)
            lemfreq[row["ortho"]] = max(
                old_freq,
                float(row["freqlemlivres"]),
                float(row["freqlemfilms2"]),
            )

            lemme = cls._find_word(out, row)
            if lemme is None:
                continue
            cls._register_variant(lemme, row)

        return cls(out, lemfreq)

    def most_common(
        self, cat_gram: CatGram, threshold: t.Optional[int] = None
    ) -> list[Mot]:
        """Return the most frequent words of a grammatical category.

        Args:
            cat_gram: category to select.
            threshold: maximum number of words returned; defaults to the
                preset of `PRESET_THRESHOLD_BY_CAT` for this category.

        Returns:
            Words of that category, by decreasing `freq`, cut to `threshold`.

        Raises:
            ValueError: if `threshold` is None and the category has no preset.
        """
        if threshold is None:
            try:
                threshold = self.PRESET_THRESHOLD_BY_CAT[cat_gram]
            except KeyError as exn:
                raise ValueError(
                    f"No threshold preset for grammatical category {cat_gram}, "
                    "please provide a threshold manually"
                ) from exn

        selected = [word for word in self.dataset if word.cat_gram == cat_gram]
        selected.sort(key=lambda word: word.freq, reverse=True)
        return selected[:threshold]

    def word_db(self, thresholds: t.Optional[dict[CatGram, int]] = None) -> WordDb:
        """Convert to a WordDb

        Args:
            thresholds: optional per-category word counts; categories left
                out use the preset threshold of `most_common`.

        Raises:
            KeyError: if a selected noun, adjective or verb lacks one of the
                forms required below in its `variantes`.
        """
        thresholds = thresholds or {}

        noms = self.most_common(CatGram.NOM, thresholds.get(CatGram.NOM, None))
        db_noms = [
            Nom(
                genre=t.cast(Genre, nom.genre),  # not None for noms
                sing=nom.variantes[(Nombre.SING,)],
                plur=nom.variantes[(Nombre.PLUR,)],
            )
            for nom in noms
        ]

        adjectifs = self.most_common(
            CatGram.ADJECTIF, thresholds.get(CatGram.ADJECTIF, None)
        )
        db_adjectifs = [
            Adjectif(
                masc_sing=adj.variantes[(Genre.MASC, Nombre.SING)],
                masc_plur=adj.variantes[(Genre.MASC, Nombre.PLUR)],
                fem_sing=adj.variantes[(Genre.FEM, Nombre.SING)],
                fem_plur=adj.variantes[(Genre.FEM, Nombre.PLUR)],
            )
            for adj in adjectifs
        ]

        verbes = self.most_common(CatGram.VERBE, thresholds.get(CatGram.VERBE, None))
        db_verbes = [
            Verbe(
                present_sing=verbe.variantes[(Temps.PRESENT, Nombre.SING)],
                present_plur=verbe.variantes[(Temps.PRESENT, Nombre.PLUR)],
                futur_sing=verbe.variantes[(Temps.FUTUR, Nombre.SING)],
                futur_plur=verbe.variantes[(Temps.FUTUR, Nombre.PLUR)],
                imparfait_sing=verbe.variantes[(Temps.IMPARFAIT, Nombre.SING)],
                imparfait_plur=verbe.variantes[(Temps.IMPARFAIT, Nombre.PLUR)],
            )
            for verbe in verbes
        ]

        adverbes = self.most_common(
            CatGram.ADVERBE, thresholds.get(CatGram.ADVERBE, None)
        )
        db_adverbes = [Adverbe(adv=adv.mot) for adv in adverbes]

        return WordDb(
            noms=db_noms, adjectifs=db_adjectifs, verbes=db_verbes, adverbes=db_adverbes
        )