From d3fbd470373e57728e3551d83e9fd0c9a708ff2f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?= <contact@tobast.fr>
Date: Wed, 14 Aug 2024 16:50:46 +0200
Subject: [PATCH] Some dataset parsing, some tentative generation

---
 pwgen_fr/generate.py |  45 +++++++++++++++
 pwgen_fr/lexique.py  | 122 +++++++++++++++++++++++++++++++++++++++++++
 requirements.txt     |   0
 setup.py             |  33 ++++++++++++
 4 files changed, 200 insertions(+)
 create mode 100644 pwgen_fr/generate.py
 create mode 100644 pwgen_fr/lexique.py
 create mode 100644 requirements.txt
 create mode 100644 setup.py

diff --git a/pwgen_fr/generate.py b/pwgen_fr/generate.py
new file mode 100644
index 0000000..677853b
--- /dev/null
+++ b/pwgen_fr/generate.py
@@ -0,0 +1,45 @@
+import secrets
+
+from . import lexique
+
+# The dataset is parsed once at import time and shared by every generator.
+lex = lexique.Lexique.parse()
+
+# Grammatical categories gen_rand() draws from.
+_RAND_CATEGORIES = (
+    lexique.CatGram.ADJECTIF,
+    lexique.CatGram.NOM,
+    lexique.CatGram.VERBE,
+    lexique.CatGram.ADVERBE,
+)
+
+
+def _join_words(words):
+    """Join picked dataset entries into a space-separated passphrase."""
+    return " ".join(word.word for word in words)
+
+
+def gen_phrase4():
+    """Generate a 4-word passphrase shaped as: adjective, noun, verb, noun."""
+    cats = (
+        lexique.CatGram.ADJECTIF,
+        lexique.CatGram.NOM,
+        lexique.CatGram.VERBE,
+        lexique.CatGram.NOM,
+    )
+    return _join_words(secrets.choice(lex.most_common(cat)) for cat in cats)
+
+
+def gen_rand(n=4):
+    """Generate a passphrase of `n` words of random grammatical categories."""
+    out = []
+    for _ in range(n):
+        cat = secrets.choice(_RAND_CATEGORIES)
+        out.append(secrets.choice(lex.most_common(cat)))
+    return _join_words(out)
+
+
+def gen_nom(n=4):
+    """Generate a passphrase of `n` nouns."""
+    nouns = lex.most_common(lexique.CatGram.NOM)
+    return _join_words(secrets.choice(nouns) for _ in range(n))
diff --git a/pwgen_fr/lexique.py b/pwgen_fr/lexique.py
new file mode 100644
index 0000000..dc5a578
--- /dev/null
+++ b/pwgen_fr/lexique.py
@@ -0,0 +1,122 @@
+import csv
+import logging
+import subprocess
+import typing as t
+import enum
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+class CatGram(enum.Enum):
+    NOM = "NOM"
+    VERBE = "VER"
"ADJ" + ADVERBE = "ADV" + AUXILIAIRE = "AUX" + ARTICLE = "ART" + CONJONCTION = "CON" + LIAISON = "LIA" + PREPOSITION = "PRE" + PRONOM = "PRO" + ONOMATOPEE = "ONO" + + @classmethod + def parse(cls, val: str) -> "CatGram": + """Parses a 'catgram' entry""" + base = val.split(":", maxsplit=1)[0] + return cls(base) + + +class Word(t.NamedTuple): + word: str + lemme: str # canonical form + cat_gram: CatGram + freq_lem: float # occurrences of the canonical form, in films, by million words + freq: float # occurrences of this exact form, in films, by million words + + +class Lexique: + LEXIQUE_DIR_PATH = Path(__file__).parent.parent / "data/raw/Lexique383" + LEXIQUE_PATH = LEXIQUE_DIR_PATH / "Lexique383.tsv" + + PRESET_THRESHOLD_BY_CAT: dict[CatGram, int] = { + CatGram.NOM: 10000, + CatGram.VERBE: 10000, + CatGram.ADJECTIF: 10000, + CatGram.ADVERBE: 10000, + } + + dataset: list[Word] + + def __init__(self, dataset): + self.dataset = dataset + + @classmethod + def _ensure_uncompressed(cls): + """Ensures the dataset is uncompressed""" + if cls.LEXIQUE_DIR_PATH.exists(): + return + + lexique_archive = cls.LEXIQUE_DIR_PATH.with_suffix(".tar.xz") + if not lexique_archive.exists(): + logging.error("Missing compressed dataset at %s", lexique_archive) + raise Exception(f"Missing compressed dataset at {lexique_archive}") + + logging.info("Uncompressing dataset") + subprocess.check_call( + [ + "tar", + "-xJf", + lexique_archive.as_posix(), + "-C", + lexique_archive.parent.as_posix(), + ] + ) + + if not cls.LEXIQUE_DIR_PATH.exists(): + logging.error( + "Uncompressed dataset still missing at %s after extraction", + cls.LEXIQUE_DIR_PATH, + ) + raise Exception( + f"Uncompressed dataset still missing at {cls.LEXIQUE_DIR_PATH} after extraction" + ) + + @classmethod + def parse(cls) -> "Lexique": + out = [] + with cls.LEXIQUE_PATH.open("r") as h: + reader = csv.DictReader(h, dialect="excel-tab") + for row in reader: + if not row["cgram"]: + continue + try: + out.append( + Word( + 
word=row["ortho"], + lemme=row["lemme"], + cat_gram=CatGram.parse(row["cgram"]), + freq_lem=float(row["freqlemlivres"]), + freq=float(row["freqlivres"]), + ) + ) + except ValueError as exn: + print(row) + raise exn from exn + return cls(out) + + def most_common( + self, cat_gram: CatGram, threshold: t.Optional[int] = None + ) -> list[Word]: + if threshold is None: + try: + threshold = self.PRESET_THRESHOLD_BY_CAT[cat_gram] + except KeyError as exn: + raise ValueError( + f"No threshold preset for grammatical category {cat_gram}, " + "please provide a threshold manually" + ) from exn + out = list(filter(lambda word: word.cat_gram == cat_gram, self.dataset)) + out.sort(key=lambda word: word.freq, reverse=True) + return out[:threshold] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e69de29 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..5d28848 --- /dev/null +++ b/setup.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 + +from setuptools import setup, find_packages + + +def parse_requirements(): + reqs = [] + with open("requirements.txt", "r") as handle: + for line in handle: + reqs.append(line) + return reqs + + +setup( + name="pwgen_fr", + version="0.1.0", + description="Générateur de mots de passes forts basés sur des mots français, et les listes de mots associées", + author="tobast", + author_email="contact@tobast.fr", + license="LICENSE", + url="https://git.tobast.fr/tobast/pwgen-fr/", + packages=find_packages(), + include_package_data=True, + long_description=open("README.md").read(), + install_requires=parse_requirements(), + entry_points={ + "console_scripts": [ + # ( + # "proxmox-snapshot-review = proxmox_scripts.snapshots:review_snapshots", + # ), + ] + }, +)