# pwgen-fr/pwgen_fr/lexique.py

import csv
import itertools
from dataclasses import dataclass, field
import logging
import subprocess
import typing as t
from bisect import bisect_left
import enum
from pathlib import Path

from .word_db import Genre, Nombre, Temps, Nom, Adjectif, Verbe, Adverbe, WordDb

logger = logging.getLogger(__name__)


class CatGram(enum.Enum):
    NOM = "NOM"
    VERBE = "VER"
    ADJECTIF = "ADJ"
    ADVERBE = "ADV"
    AUXILIAIRE = "AUX"
    ARTICLE = "ART"
    CONJONCTION = "CON"
    LIAISON = "LIA"
    PREPOSITION = "PRE"
    PRONOM = "PRO"
    ONOMATOPEE = "ONO"

    @classmethod
    def parse(cls, val: str) -> "CatGram":
        """Parses a 'cgram' entry, keeping only the base grammatical category."""
        base = val.split(":", maxsplit=1)[0]
        return cls(base)

    def __lt__(self, oth):
        return self.value < oth.value


def match_enum_or_all(val: str, enum_mapper, enum_cls) -> list:
    """The corresponding enum value, if any; otherwise, all members of the enum."""
    if val in enum_mapper:
        return [enum_mapper[val]]
    return list(enum_cls)
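
# Illustrative calls (hypothetical, assuming Genre defines exactly MASC and FEM,
# in that order):
#   match_enum_or_all("f", {"m": Genre.MASC, "f": Genre.FEM}, Genre) -> [Genre.FEM]
#   match_enum_or_all("",  {"m": Genre.MASC, "f": Genre.FEM}, Genre) -> [Genre.MASC, Genre.FEM]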


@dataclass
class Mot:
    mot: str
    lemme: str
    cat_gram: CatGram
    freq: float  # occurrences of the canonical form per million words
    variantes: dict[tuple, str] = field(default_factory=dict)
    genre: t.Optional[Genre] = None
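
# Conceptual example of a parsed entry (illustrative values only, not taken from
# the dataset): a feminine noun carries its number variants, e.g.
#   Mot(mot="maison", lemme="maison", cat_gram=CatGram.NOM, freq=100.0,
#       genre=Genre.FEM,
#       variantes={(Nombre.SING,): "maison", (Nombre.PLUR,): "maisons"})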


class Lexique:
    LEXIQUE_DIR_PATH = Path(__file__).parent.parent / "data/raw/Lexique383"
    LEXIQUE_PATH = LEXIQUE_DIR_PATH / "Lexique383.tsv"

    PRESET_THRESHOLD_BY_CAT: dict[CatGram, int] = {
        CatGram.NOM: 10000,
        CatGram.VERBE: 10000,
        CatGram.ADJECTIF: 10000,
        CatGram.ADVERBE: 10000,
    }

    class Parsers:
        """Datatables to help parse the original data"""

        genre: dict[str, Genre] = {
            "m": Genre.MASC,
            "f": Genre.FEM,
        }
        rev_genre: dict[t.Optional[Genre], str] = {
            None: "",
            Genre.MASC: "m",
            Genre.FEM: "f",
        }
        nombre: dict[str, Nombre] = {
            "s": Nombre.SING,
            "p": Nombre.PLUR,
        }
        verbe_temps: dict[str, Temps] = {
            "ind:pre": Temps.PRESENT,
            "ind:fut": Temps.FUTUR,
            "ind:imp": Temps.IMPARFAIT,
        }
        verbe_personne: dict[str, Nombre] = {
            "3s": Nombre.SING,
            "3p": Nombre.PLUR,
        }

    dataset: list[Mot]

    def __init__(self, dataset):
        self.dataset = dataset

    @classmethod
    def _ensure_uncompressed(cls):
        """Ensures the dataset is uncompressed"""
        if cls.LEXIQUE_DIR_PATH.exists():
            return
        lexique_archive = cls.LEXIQUE_DIR_PATH.with_suffix(".tar.xz")
        if not lexique_archive.exists():
            logger.error("Missing compressed dataset at %s", lexique_archive)
            raise Exception(f"Missing compressed dataset at {lexique_archive}")
        logger.info("Uncompressing dataset")
        subprocess.check_call(
            [
                "tar",
                "-xJf",
                lexique_archive.as_posix(),
                "-C",
                lexique_archive.parent.as_posix(),
            ]
        )
        if not cls.LEXIQUE_DIR_PATH.exists():
            logger.error(
                "Uncompressed dataset still missing at %s after extraction",
                cls.LEXIQUE_DIR_PATH,
            )
            raise Exception(
                f"Uncompressed dataset still missing at {cls.LEXIQUE_DIR_PATH} after extraction"
            )

    @classmethod
    def _find_word_key(cls, mot: Mot):
        return (mot.lemme, mot.cat_gram, cls.Parsers.rev_genre[mot.genre])

    @classmethod
    def _find_word(cls, dataset: list[Mot], row: dict) -> t.Optional[Mot]:
        str_lemme = row["lemme"]
        cat_gram = CatGram.parse(row["cgram"])
        genre = row["genre"] if cat_gram == CatGram.NOM else ""
        row_key = (
            str_lemme,
            cat_gram,
            genre,
        )
        # Binary search: `dataset` must be sorted by `_find_word_key`.
        lemme_pos = bisect_left(
            dataset,
            row_key,
            key=cls._find_word_key,
        )
        if lemme_pos >= len(dataset):
            return None
        out = dataset[lemme_pos]
        if row_key != cls._find_word_key(out):
            return None
        return out
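
    # Illustrative lookup (hypothetical row): {"lemme": "chat", "cgram": "NOM",
    # "genre": "m", ...} is searched under the key ("chat", CatGram.NOM, "m"),
    # mirroring _find_word_key applied to the canonical-form entries.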

    @classmethod
    def parse(cls) -> "Lexique":
        out = []
        rows = []
        # Lexique 3.83 is distributed as a UTF-8 tab-separated file.
        with cls.LEXIQUE_PATH.open("r", encoding="utf-8") as h:
            reader = csv.DictReader(h, dialect="excel-tab")
            for row in reader:
                if not row["cgram"]:
                    continue
                rows.append(row)

        # First pass: generate canonical forms (lemmes)
        for row in rows:
            cat_gram = CatGram.parse(row["cgram"])
            if (row["lemme"] != row["ortho"]) and not (
                cat_gram == CatGram.NOM and row["genre"] == "f" and row["nombre"] == "s"
            ):
                # A feminine singular noun is considered a canonical form
                continue
            genre: t.Optional[Genre] = None
            if cat_gram == CatGram.NOM:
                genre = cls.Parsers.genre.get(row["genre"], None)
            out.append(
                Mot(
                    mot=row["ortho"],
                    lemme=row["lemme"],
                    cat_gram=cat_gram,
                    freq=float(row["freqlemlivres"]),
                    genre=genre,
                )
            )
        out.sort(key=cls._find_word_key)  # We need to bisect on this.

        # Second pass: populate variants
        for row in rows:
            lemme = cls._find_word(out, row)
            if lemme is None:
                continue
            if lemme.cat_gram == CatGram.NOM:
                nombres = match_enum_or_all(row["nombre"], cls.Parsers.nombre, Nombre)
                for nombre in nombres:
                    lemme.variantes[(nombre,)] = row["ortho"]
            elif lemme.cat_gram == CatGram.VERBE:
                # `infover` holds semicolon-separated mode:tense:person entries
                # (not every entry carries all three fields).
                infover = row["infover"].split(";")
                for raw_ver in infover:
                    ver = raw_ver.split(":")
                    temps_select = ":".join(ver[0:2])
                    try:
                        temps = Temps(temps_select)
                    except ValueError:
                        continue  # only the tenses listed in Temps are kept
                    personne = cls.Parsers.verbe_personne.get(ver[2], None)
                    if personne is None:
                        continue  # we're not interested in all conj. persons
                    lemme.variantes[(temps, personne)] = row["ortho"]
            elif lemme.cat_gram == CatGram.ADJECTIF:
                genres = match_enum_or_all(row["genre"], cls.Parsers.genre, Genre)
                nombres = match_enum_or_all(row["nombre"], cls.Parsers.nombre, Nombre)
                for genre, nombre in itertools.product(genres, nombres):
                    lemme.variantes[(genre, nombre)] = row["ortho"]
            # No need to match adverbs (invariant)
        return cls(out)

    def most_common(
        self, cat_gram: CatGram, threshold: t.Optional[int] = None
    ) -> list[Mot]:
        if threshold is None:
            try:
                threshold = self.PRESET_THRESHOLD_BY_CAT[cat_gram]
            except KeyError as exn:
                raise ValueError(
                    f"No threshold preset for grammatical category {cat_gram}, "
                    "please provide a threshold manually"
                ) from exn
        out = list(filter(lambda word: word.cat_gram == cat_gram, self.dataset))
        out.sort(key=lambda word: word.freq, reverse=True)
        return out[:threshold]
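
    # Usage sketch (assuming a parsed instance named `lexique`):
    #   lexique.most_common(CatGram.NOM)        # 10000 most frequent nouns (preset)
    #   lexique.most_common(CatGram.NOM, 500)   # explicit threshold
    #   lexique.most_common(CatGram.PRONOM)     # ValueError: no preset for this category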

    def word_db(self, thresholds: t.Optional[dict[CatGram, int]] = None) -> WordDb:
        """Convert to a WordDb"""
        thresholds = thresholds or {}
        noms = self.most_common(CatGram.NOM, thresholds.get(CatGram.NOM, None))
        db_noms = [
            Nom(
                genre=t.cast(Genre, nom.genre),  # not None for noms
                sing=nom.variantes[(Nombre.SING,)],
                plur=nom.variantes[(Nombre.PLUR,)],
            )
            for nom in noms
        ]
        adjectifs = self.most_common(
            CatGram.ADJECTIF, thresholds.get(CatGram.ADJECTIF, None)
        )
        db_adjectifs = [
            Adjectif(
                masc_sing=adj.variantes[(Genre.MASC, Nombre.SING)],
                masc_plur=adj.variantes[(Genre.MASC, Nombre.PLUR)],
                fem_sing=adj.variantes[(Genre.FEM, Nombre.SING)],
                fem_plur=adj.variantes[(Genre.FEM, Nombre.PLUR)],
            )
            for adj in adjectifs
        ]
        verbes = self.most_common(CatGram.VERBE, thresholds.get(CatGram.VERBE, None))
        db_verbes = [
            Verbe(
                present_sing=verbe.variantes[(Temps.PRESENT, Nombre.SING)],
                present_plur=verbe.variantes[(Temps.PRESENT, Nombre.PLUR)],
                futur_sing=verbe.variantes[(Temps.FUTUR, Nombre.SING)],
                futur_plur=verbe.variantes[(Temps.FUTUR, Nombre.PLUR)],
                imparfait_sing=verbe.variantes[(Temps.IMPARFAIT, Nombre.SING)],
                imparfait_plur=verbe.variantes[(Temps.IMPARFAIT, Nombre.PLUR)],
            )
            for verbe in verbes
        ]
        adverbes = self.most_common(
            CatGram.ADVERBE, thresholds.get(CatGram.ADVERBE, None)
        )
        db_adverbes = [Adverbe(adv=adv.mot) for adv in adverbes]
        return WordDb(
            noms=db_noms, adjectifs=db_adjectifs, verbes=db_verbes, adverbes=db_adverbes
        )
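

# Minimal usage sketch: builds the word database from the bundled Lexique 3.83
# data. It assumes WordDb exposes the `noms`/`adjectifs`/`verbes`/`adverbes`
# fields it is constructed with above.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    # parse() reads the TSV directly, so make sure the archive is extracted first.
    Lexique._ensure_uncompressed()
    lexique = Lexique.parse()
    # Peek at the ten most frequent nouns and their recorded variants.
    for mot in lexique.most_common(CatGram.NOM)[:10]:
        print(mot.lemme, mot.freq, sorted(mot.variantes.values()))
    db = lexique.word_db()
    print(len(db.noms), "noms,", len(db.adjectifs), "adjectifs,",
          len(db.verbes), "verbes,", len(db.adverbes), "adverbes")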