From d3fbd470373e57728e3551d83e9fd0c9a708ff2f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?= <contact@tobast.fr>
Date: Wed, 14 Aug 2024 16:50:46 +0200
Subject: [PATCH] Some dataset parsing, some tentative generation

---
 pwgen_fr/generate.py |  45 +++++++++++++++
 pwgen_fr/lexique.py  | 122 +++++++++++++++++++++++++++++++++++++++++++
 requirements.txt     |   0
 setup.py             |  33 ++++++++++++
 4 files changed, 200 insertions(+)
 create mode 100644 pwgen_fr/generate.py
 create mode 100644 pwgen_fr/lexique.py
 create mode 100644 requirements.txt
 create mode 100644 setup.py

diff --git a/pwgen_fr/generate.py b/pwgen_fr/generate.py
new file mode 100644
index 0000000..677853b
--- /dev/null
+++ b/pwgen_fr/generate.py
@@ -0,0 +1,45 @@
+import secrets
+
+from . import lexique
+
+# The dataset is parsed once at import time and shared by every generator.
+lex = lexique.Lexique.parse()
+
+# Grammatical categories gen_rand() draws from.
+_RAND_CATEGORIES = (
+    lexique.CatGram.ADJECTIF,
+    lexique.CatGram.NOM,
+    lexique.CatGram.VERBE,
+    lexique.CatGram.ADVERBE,
+)
+
+
+def _join_words(words):
+    """Join picked dataset entries into a space-separated passphrase."""
+    return " ".join(word.word for word in words)
+
+
+def gen_phrase4():
+    """Generate a 4-word passphrase shaped as: adjective, noun, verb, noun."""
+    cats = (
+        lexique.CatGram.ADJECTIF,
+        lexique.CatGram.NOM,
+        lexique.CatGram.VERBE,
+        lexique.CatGram.NOM,
+    )
+    return _join_words(secrets.choice(lex.most_common(cat)) for cat in cats)
+
+
+def gen_rand(n=4):
+    """Generate a passphrase of `n` words of random grammatical categories."""
+    out = []
+    for _ in range(n):
+        cat = secrets.choice(_RAND_CATEGORIES)
+        out.append(secrets.choice(lex.most_common(cat)))
+    return _join_words(out)
+
+
+def gen_nom(n=4):
+    """Generate a passphrase of `n` nouns."""
+    nouns = lex.most_common(lexique.CatGram.NOM)
+    return _join_words(secrets.choice(nouns) for _ in range(n))
diff --git a/pwgen_fr/lexique.py b/pwgen_fr/lexique.py
new file mode 100644
index 0000000..dc5a578
--- /dev/null
+++ b/pwgen_fr/lexique.py
@@ -0,0 +1,122 @@
+import csv
+import logging
+import subprocess
+import typing as t
+import enum
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+class CatGram(enum.Enum):
+    NOM = "NOM"
+    VERBE = "VER"
"ADJ" + ADVERBE = "ADV" + AUXILIAIRE = "AUX" + ARTICLE = "ART" + CONJONCTION = "CON" + LIAISON = "LIA" + PREPOSITION = "PRE" + PRONOM = "PRO" + ONOMATOPEE = "ONO" + + @classmethod + def parse(cls, val: str) -> "CatGram": + """Parses a 'catgram' entry""" + base = val.split(":", maxsplit=1)[0] + return cls(base) + + +class Word(t.NamedTuple): + word: str + lemme: str # canonical form + cat_gram: CatGram + freq_lem: float # occurrences of the canonical form, in films, by million words + freq: float # occurrences of this exact form, in films, by million words + + +class Lexique: + LEXIQUE_DIR_PATH = Path(__file__).parent.parent / "data/raw/Lexique383" + LEXIQUE_PATH = LEXIQUE_DIR_PATH / "Lexique383.tsv" + + PRESET_THRESHOLD_BY_CAT: dict[CatGram, int] = { + CatGram.NOM: 10000, + CatGram.VERBE: 10000, + CatGram.ADJECTIF: 10000, + CatGram.ADVERBE: 10000, + } + + dataset: list[Word] + + def __init__(self, dataset): + self.dataset = dataset + + @classmethod + def _ensure_uncompressed(cls): + """Ensures the dataset is uncompressed""" + if cls.LEXIQUE_DIR_PATH.exists(): + return + + lexique_archive = cls.LEXIQUE_DIR_PATH.with_suffix(".tar.xz") + if not lexique_archive.exists(): + logging.error("Missing compressed dataset at %s", lexique_archive) + raise Exception(f"Missing compressed dataset at {lexique_archive}") + + logging.info("Uncompressing dataset") + subprocess.check_call( + [ + "tar", + "-xJf", + lexique_archive.as_posix(), + "-C", + lexique_archive.parent.as_posix(), + ] + ) + + if not cls.LEXIQUE_DIR_PATH.exists(): + logging.error( + "Uncompressed dataset still missing at %s after extraction", + cls.LEXIQUE_DIR_PATH, + ) + raise Exception( + f"Uncompressed dataset still missing at {cls.LEXIQUE_DIR_PATH} after extraction" + ) + + @classmethod + def parse(cls) -> "Lexique": + out = [] + with cls.LEXIQUE_PATH.open("r") as h: + reader = csv.DictReader(h, dialect="excel-tab") + for row in reader: + if not row["cgram"]: + continue + try: + out.append( + Word( + 
word=row["ortho"], + lemme=row["lemme"], + cat_gram=CatGram.parse(row["cgram"]), + freq_lem=float(row["freqlemlivres"]), + freq=float(row["freqlivres"]), + ) + ) + except ValueError as exn: + print(row) + raise exn from exn + return cls(out) + + def most_common( + self, cat_gram: CatGram, threshold: t.Optional[int] = None + ) -> list[Word]: + if threshold is None: + try: + threshold = self.PRESET_THRESHOLD_BY_CAT[cat_gram] + except KeyError as exn: + raise ValueError( + f"No threshold preset for grammatical category {cat_gram}, " + "please provide a threshold manually" + ) from exn + out = list(filter(lambda word: word.cat_gram == cat_gram, self.dataset)) + out.sort(key=lambda word: word.freq, reverse=True) + return out[:threshold] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e69de29 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..5d28848 --- /dev/null +++ b/setup.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 + +from setuptools import setup, find_packages + + +def parse_requirements(): + reqs = [] + with open("requirements.txt", "r") as handle: + for line in handle: + reqs.append(line) + return reqs + + +setup( + name="pwgen_fr", + version="0.1.0", + description="Générateur de mots de passes forts basés sur des mots français, et les listes de mots associées", + author="tobast", + author_email="contact@tobast.fr", + license="LICENSE", + url="https://git.tobast.fr/tobast/pwgen-fr/", + packages=find_packages(), + include_package_data=True, + long_description=open("README.md").read(), + install_requires=parse_requirements(), + entry_points={ + "console_scripts": [ + # ( + # "proxmox-snapshot-review = proxmox_scripts.snapshots:review_snapshots", + # ), + ] + }, +)