Some dataset parsing, some tentative generation

2024-08-14 16:50:46 +02:00 · 2024-08-14 16:50:46 +02:00 · d3fbd47037
commit d3fbd47037
parent d48ce95406
4 changed files with 192 additions and 0 deletions
--- a/pwgen_fr/generate.py
+++ b/pwgen_fr/generate.py
@ -0,0 +1,37 @@
 import secrets
 from . import lexique
 # Shared lexicon used by every generator below. It is built when this module
 # is imported: Lexique.parse() reads the whole TSV dataset from disk, so the
 # import itself performs file I/O and raises if the dataset cannot be read.
 lex = lexique.Lexique.parse()
 def gen_phrase4():
    """Build a four-word phrase shaped adjective, noun, verb, noun.

    Each word is drawn with `secrets.choice` among the most common entries
    of its grammatical category (preset threshold of the lexicon). The
    written forms are returned joined by single spaces.
    """
    pattern = (
        lexique.CatGram.ADJECTIF,
        lexique.CatGram.NOM,
        lexique.CatGram.VERBE,
        lexique.CatGram.NOM,
    )
    picked = [secrets.choice(lex.most_common(category)) for category in pattern]
    return " ".join(entry.word for entry in picked)
 def gen_rand(n=4):
    """Build a phrase of `n` words, each of a randomly chosen category.

    For every word, a category is first drawn among adjective, noun, verb
    and adverb, then a word is drawn among the most common entries of that
    category. Both draws use `secrets.choice`. The written forms are
    returned joined by single spaces.
    """
    categories = (
        lexique.CatGram.ADJECTIF,
        lexique.CatGram.NOM,
        lexique.CatGram.VERBE,
        lexique.CatGram.ADVERBE,
    )

    def draw_entry():
        # Category first, then a word of that category: same draw order
        # for every position of the phrase.
        category = secrets.choice(categories)
        return secrets.choice(lex.most_common(category))

    words = [draw_entry().word for _ in range(n)]
    return " ".join(words)
 def gen_nom(n=4):
    """Build a phrase made of `n` nouns.

    Each noun is drawn with `secrets.choice` among the most common nouns
    of the lexicon (preset threshold). The written forms are returned
    joined by single spaces.
    """
    # most_common() filters and sorts the whole dataset on every call and
    # its result does not depend on the loop, so compute the candidate list
    # once instead of once per generated word.
    nouns = lex.most_common(lexique.CatGram.NOM)
    return " ".join(secrets.choice(nouns).word for _ in range(n))
--- a/pwgen_fr/lexique.py
+++ b/pwgen_fr/lexique.py
@ -0,0 +1,122 @@
 import csv
 import logging
 import subprocess
 import typing as t
 import enum
 from pathlib import Path
 logger = logging.getLogger(__name__)
 class CatGram(enum.Enum):
    """Grammatical categories; each member's value is its short tag."""

    NOM = "NOM"
    VERBE = "VER"
    ADJECTIF = "ADJ"
    ADVERBE = "ADV"
    AUXILIAIRE = "AUX"
    ARTICLE = "ART"
    CONJONCTION = "CON"
    LIAISON = "LIA"
    PREPOSITION = "PRE"
    PRONOM = "PRO"
    ONOMATOPEE = "ONO"

    @classmethod
    def parse(cls, val: str) -> "CatGram":
        """Build a category from a 'catgram' entry.

        Only the text before the first ':' is looked up; whatever follows
        is ignored. Raises ValueError when that text is not the value of
        any member.
        """
        tag, _separator, _detail = val.partition(":")
        return cls(tag)
 class Word(t.NamedTuple):
    """One entry of the lexicon, as built by `Lexique.parse`."""

    word: str  # exact written form (read from the `ortho` column)
    lemme: str  # canonical form
    cat_gram: CatGram
    # Occurrences of the canonical form, by million words.
    # NOTE(review): this comment used to say "in films", but Lexique.parse
    # fills the field from the `freqlemlivres` column -- presumably the books
    # corpus. Confirm which corpus is intended.
    freq_lem: float
    # Occurrences of this exact form, by million words.
    # NOTE(review): same remark, the field is filled from `freqlivres`.
    freq: float
 class Lexique:
    """In-memory list of the words parsed from the Lexique383 dataset.

    Instances wrap a list of `Word` entries; `parse` builds one from the TSV
    file shipped under `data/raw/Lexique383`, and `most_common` extracts the
    most frequent words of one grammatical category.
    """

    LEXIQUE_DIR_PATH = Path(__file__).parent.parent / "data/raw/Lexique383"
    LEXIQUE_PATH = LEXIQUE_DIR_PATH / "Lexique383.tsv"
    # Default number of words kept by most_common() for each category.
    PRESET_THRESHOLD_BY_CAT: dict[CatGram, int] = {
        CatGram.NOM: 10000,
        CatGram.VERBE: 10000,
        CatGram.ADJECTIF: 10000,
        CatGram.ADVERBE: 10000,
    }
    dataset: list[Word]

    def __init__(self, dataset):
        self.dataset = dataset

    @classmethod
    def _ensure_uncompressed(cls):
        """Ensures the dataset is uncompressed.

        Does nothing when the dataset directory already exists. Otherwise the
        sibling `.tar.xz` archive is extracted with the `tar` command.

        Raises:
            FileNotFoundError: the archive is missing, or the dataset
                directory is still missing once extraction is done.
            subprocess.CalledProcessError: `tar` exited with an error.
        """
        if cls.LEXIQUE_DIR_PATH.exists():
            return
        lexique_archive = cls.LEXIQUE_DIR_PATH.with_suffix(".tar.xz")
        if not lexique_archive.exists():
            logger.error("Missing compressed dataset at %s", lexique_archive)
            raise FileNotFoundError(
                f"Missing compressed dataset at {lexique_archive}"
            )
        logger.info("Uncompressing dataset")
        subprocess.check_call(
            [
                "tar",
                "-xJf",
                lexique_archive.as_posix(),
                "-C",
                lexique_archive.parent.as_posix(),
            ]
        )
        if not cls.LEXIQUE_DIR_PATH.exists():
            logger.error(
                "Uncompressed dataset still missing at %s after extraction",
                cls.LEXIQUE_DIR_PATH,
            )
            raise FileNotFoundError(
                f"Uncompressed dataset still missing at {cls.LEXIQUE_DIR_PATH} after extraction"
            )

    @classmethod
    def parse(cls) -> "Lexique":
        """Parse the dataset file into a `Lexique`.

        The dataset is uncompressed first if needed. Rows whose `cgram`
        column is empty are skipped.

        Raises:
            FileNotFoundError: the dataset (or its archive) is missing.
            ValueError: a row holds an unknown grammatical category or a
                frequency that is not a number; the row is logged first.
        """
        # Without this call, a checkout holding only the archive cannot be
        # parsed: nothing else triggers the extraction.
        cls._ensure_uncompressed()
        out = []
        # Explicit encoding: the words are accented French text and must not
        # depend on the platform default (assumes the TSV is UTF-8 -- TODO
        # confirm). newline="" is what the csv module asks for.
        with cls.LEXIQUE_PATH.open("r", encoding="utf-8", newline="") as h:
            reader = csv.DictReader(h, dialect="excel-tab")
            for row in reader:
                if not row["cgram"]:
                    continue
                try:
                    out.append(
                        Word(
                            word=row["ortho"],
                            lemme=row["lemme"],
                            cat_gram=CatGram.parse(row["cgram"]),
                            freq_lem=float(row["freqlemlivres"]),
                            freq=float(row["freqlivres"]),
                        )
                    )
                except ValueError:
                    logger.error("Could not parse dataset row: %s", row)
                    # Bare raise keeps the original traceback without
                    # chaining the exception to itself.
                    raise
        return cls(out)

    def most_common(
        self, cat_gram: CatGram, threshold: t.Optional[int] = None
    ) -> list[Word]:
        """Return the most frequent words of a grammatical category.

        Args:
            cat_gram: category the returned words belong to.
            threshold: maximum number of words returned; defaults to the
                preset of the category in `PRESET_THRESHOLD_BY_CAT`.

        Returns:
            A new list holding at most `threshold` words, sorted by
            decreasing `freq`.

        Raises:
            ValueError: no threshold was given and the category has no
                preset.
        """
        if threshold is None:
            try:
                threshold = self.PRESET_THRESHOLD_BY_CAT[cat_gram]
            except KeyError as exn:
                raise ValueError(
                    f"No threshold preset for grammatical category {cat_gram}, "
                    "please provide a threshold manually"
                ) from exn
        candidates = [word for word in self.dataset if word.cat_gram == cat_gram]
        candidates.sort(key=lambda word: word.freq, reverse=True)
        return candidates[:threshold]
--- a/requirements.txt
+++ b/requirements.txt
--- a/setup.py
+++ b/setup.py
@ -0,0 +1,33 @@
 #!/usr/bin/env python3
 from setuptools import setup, find_packages
 def parse_requirements():
    """Return the requirement specifiers listed in ``requirements.txt``.

    The file is looked up in the current working directory. Surrounding
    whitespace (including the trailing newline) is stripped from each line;
    blank lines and lines starting with ``#`` are skipped, since neither is
    a valid requirement specifier.
    """
    reqs = []
    with open("requirements.txt", "r", encoding="utf-8") as handle:
        for line in handle:
            requirement = line.strip()
            if not requirement or requirement.startswith("#"):
                continue
            reqs.append(requirement)
    return reqs
 # Read the README through a context manager so the file handle is closed,
 # with an explicit encoding so the result does not depend on the platform
 # default (assumes README.md is UTF-8 -- TODO confirm).
 with open("README.md", "r", encoding="utf-8") as readme_handle:
    readme_text = readme_handle.read()
 setup(
    name="pwgen_fr",
    version="0.1.0",
    description="Générateur de mots de passes forts basés sur des mots français, et les listes de mots associées",
    author="tobast",
    author_email="contact@tobast.fr",
    license="LICENSE",
    url="https://git.tobast.fr/tobast/pwgen-fr/",
    packages=find_packages(),
    include_package_data=True,
    long_description=readme_text,
    install_requires=parse_requirements(),
    entry_points={
        # No console script is declared yet.
        "console_scripts": [],
    },
 )