Some dataset parsing, some tentative generation

Théophile Bastian 2024-08-14 16:50:46 +02:00
parent d48ce95406
commit d3fbd47037
4 changed files with 192 additions and 0 deletions

pwgen_fr/generate.py (new file, 37 lines added)

@@ -0,0 +1,37 @@
import secrets

from . import lexique

lex = lexique.Lexique.parse()


def gen_phrase4():
    """Generate a four-word phrase: adjective, noun, verb, noun."""
    out = []
    out.append(secrets.choice(lex.most_common(lexique.CatGram.ADJECTIF)))
    out.append(secrets.choice(lex.most_common(lexique.CatGram.NOM)))
    out.append(secrets.choice(lex.most_common(lexique.CatGram.VERBE)))
    out.append(secrets.choice(lex.most_common(lexique.CatGram.NOM)))
    return " ".join(word.word for word in out)


def gen_rand(n=4):
    """Generate `n` words, each drawn from a random grammatical category."""
    out = []
    for _ in range(n):
        cat = secrets.choice(
            (
                lexique.CatGram.ADJECTIF,
                lexique.CatGram.NOM,
                lexique.CatGram.VERBE,
                lexique.CatGram.ADVERBE,
            )
        )
        out.append(secrets.choice(lex.most_common(cat)))
    return " ".join(word.word for word in out)


def gen_nom(n=4):
    """Generate `n` nouns."""
    out = []
    for _ in range(n):
        cat = lexique.CatGram.NOM
        out.append(secrets.choice(lex.most_common(cat)))
    return " ".join(word.word for word in out)

pwgen_fr/lexique.py (new file, 122 lines added)

@@ -0,0 +1,122 @@
import csv
import enum
import logging
import subprocess
import typing as t
from pathlib import Path

logger = logging.getLogger(__name__)


class CatGram(enum.Enum):
    NOM = "NOM"
    VERBE = "VER"
    ADJECTIF = "ADJ"
    ADVERBE = "ADV"
    AUXILIAIRE = "AUX"
    ARTICLE = "ART"
    CONJONCTION = "CON"
    LIAISON = "LIA"
    PREPOSITION = "PRE"
    PRONOM = "PRO"
    ONOMATOPEE = "ONO"

    @classmethod
    def parse(cls, val: str) -> "CatGram":
        """Parses a 'cgram' entry, keeping only the base category (e.g. "VER:ind" -> VERBE)"""
        base = val.split(":", maxsplit=1)[0]
        return cls(base)


class Word(t.NamedTuple):
    word: str
    lemme: str  # canonical form
    cat_gram: CatGram
    freq_lem: float  # occurrences of the canonical form in books, per million words
    freq: float  # occurrences of this exact form in books, per million words


class Lexique:
    LEXIQUE_DIR_PATH = Path(__file__).parent.parent / "data/raw/Lexique383"
    LEXIQUE_PATH = LEXIQUE_DIR_PATH / "Lexique383.tsv"

    PRESET_THRESHOLD_BY_CAT: dict[CatGram, int] = {
        CatGram.NOM: 10000,
        CatGram.VERBE: 10000,
        CatGram.ADJECTIF: 10000,
        CatGram.ADVERBE: 10000,
    }

    dataset: list[Word]

    def __init__(self, dataset):
        self.dataset = dataset

    @classmethod
    def _ensure_uncompressed(cls):
        """Ensures the dataset is uncompressed"""
        if cls.LEXIQUE_DIR_PATH.exists():
            return
        lexique_archive = cls.LEXIQUE_DIR_PATH.with_suffix(".tar.xz")
        if not lexique_archive.exists():
            logger.error("Missing compressed dataset at %s", lexique_archive)
            raise Exception(f"Missing compressed dataset at {lexique_archive}")
        logger.info("Uncompressing dataset")
        subprocess.check_call(
            [
                "tar",
                "-xJf",
                lexique_archive.as_posix(),
                "-C",
                lexique_archive.parent.as_posix(),
            ]
        )
        if not cls.LEXIQUE_DIR_PATH.exists():
            logger.error(
                "Uncompressed dataset still missing at %s after extraction",
                cls.LEXIQUE_DIR_PATH,
            )
            raise Exception(
                f"Uncompressed dataset still missing at {cls.LEXIQUE_DIR_PATH} after extraction"
            )

    @classmethod
    def parse(cls) -> "Lexique":
        """Parses the Lexique383 TSV dataset into a Lexique instance"""
        cls._ensure_uncompressed()  # make sure the dataset is extracted before opening it
        out = []
        with cls.LEXIQUE_PATH.open("r") as h:
            reader = csv.DictReader(h, dialect="excel-tab")
            for row in reader:
                if not row["cgram"]:
                    continue
                try:
                    out.append(
                        Word(
                            word=row["ortho"],
                            lemme=row["lemme"],
                            cat_gram=CatGram.parse(row["cgram"]),
                            freq_lem=float(row["freqlemlivres"]),
                            freq=float(row["freqlivres"]),
                        )
                    )
                except ValueError:
                    logger.error("Failed to parse row: %s", row)
                    raise
        return cls(out)

    def most_common(
        self, cat_gram: CatGram, threshold: t.Optional[int] = None
    ) -> list[Word]:
        """Returns the `threshold` most frequent words of the given grammatical category"""
        if threshold is None:
            try:
                threshold = self.PRESET_THRESHOLD_BY_CAT[cat_gram]
            except KeyError as exn:
                raise ValueError(
                    f"No threshold preset for grammatical category {cat_gram}, "
                    "please provide a threshold manually"
                ) from exn
        out = list(filter(lambda word: word.cat_gram == cat_gram, self.dataset))
        out.sort(key=lambda word: word.freq, reverse=True)
        return out[:threshold]
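A small sketch of how the parser and most_common might be exercised directly; the threshold values below are illustrative, the presets default to the 10000 most frequent words per category:

    from pwgen_fr import lexique

    lex = lexique.Lexique.parse()  # extracts and parses the TSV on first use

    # The 2000 most frequent French nouns by book frequency, most frequent first
    nouns = lex.most_common(lexique.CatGram.NOM, threshold=2000)
    print(nouns[0].word, nouns[0].freq)

    # Categories without a preset (e.g. pronouns) need an explicit threshold
    pronouns = lex.most_common(lexique.CatGram.PRONOM, threshold=50)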

requirements.txt (new file, empty)

setup.py (new file, 33 lines added)

@@ -0,0 +1,33 @@
#!/usr/bin/env python3

from setuptools import setup, find_packages


def parse_requirements():
    """Read requirements.txt into a list of requirement strings."""
    reqs = []
    with open("requirements.txt", "r") as handle:
        for line in handle:
            line = line.strip()
            if line:
                reqs.append(line)
    return reqs


setup(
    name="pwgen_fr",
    version="0.1.0",
    description="Générateur de mots de passe forts basés sur des mots français, et les listes de mots associées",
    author="tobast",
    author_email="contact@tobast.fr",
    license="LICENSE",
    url="https://git.tobast.fr/tobast/pwgen-fr/",
    packages=find_packages(),
    include_package_data=True,
    long_description=open("README.md").read(),
    install_requires=parse_requirements(),
    entry_points={
        "console_scripts": [
            # (
            #     "proxmox-snapshot-review = proxmox_scripts.snapshots:review_snapshots",
            # ),
        ]
    },
)