Some dataset parsing, some tentative generation
This commit is contained in:
parent
d48ce95406
commit
d3fbd47037
4 changed files with 192 additions and 0 deletions
37
pwgen_fr/generate.py
Normal file
37
pwgen_fr/generate.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
import secrets
|
||||
|
||||
from . import lexique
|
||||
|
||||
lex = lexique.Lexique.parse()
|
||||
|
||||
|
||||
def gen_phrase4():
    """Generate a passphrase of four common French words.

    The pattern is fixed — adjective, noun, verb, noun — and each word is
    drawn with a CSPRNG (`secrets.choice`) from the most common words of
    its grammatical category.
    """
    pattern = (
        lexique.CatGram.ADJECTIF,
        lexique.CatGram.NOM,
        lexique.CatGram.VERBE,
        lexique.CatGram.NOM,
    )
    chosen = [secrets.choice(lex.most_common(cat)) for cat in pattern]
    return " ".join(entry.word for entry in chosen)
|
||||
|
||||
|
||||
def gen_rand(n=4):
    """Generate a passphrase of `n` common French words.

    Each word's grammatical category (adjective, noun, verb or adverb) is
    itself picked at random with a CSPRNG, then the word is drawn from the
    most common words of that category.
    """
    categories = (
        lexique.CatGram.ADJECTIF,
        lexique.CatGram.NOM,
        lexique.CatGram.VERBE,
        lexique.CatGram.ADVERBE,
    )
    chosen = []
    for _ in range(n):
        category = secrets.choice(categories)
        chosen.append(secrets.choice(lex.most_common(category)))
    return " ".join(entry.word for entry in chosen)
|
||||
|
||||
|
||||
def gen_nom(n=4):
    """Generate a passphrase of `n` common French nouns.

    Args:
        n: number of words in the passphrase (default 4).

    Returns:
        The `n` chosen nouns joined by single spaces.
    """
    # The category is loop-invariant: fetch the candidate list once instead
    # of re-filtering and re-sorting the whole dataset on every iteration,
    # as the original loop did.
    candidates = lex.most_common(lexique.CatGram.NOM)
    return " ".join(secrets.choice(candidates).word for _ in range(n))
|
122
pwgen_fr/lexique.py
Normal file
122
pwgen_fr/lexique.py
Normal file
|
@ -0,0 +1,122 @@
|
|||
import csv
|
||||
import logging
|
||||
import subprocess
|
||||
import typing as t
|
||||
import enum
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CatGram(enum.Enum):
    """Grammatical category of a word, as encoded in the Lexique dataset."""

    NOM = "NOM"
    VERBE = "VER"
    ADJECTIF = "ADJ"
    ADVERBE = "ADV"
    AUXILIAIRE = "AUX"
    ARTICLE = "ART"
    CONJONCTION = "CON"
    LIAISON = "LIA"
    PREPOSITION = "PRE"
    PRONOM = "PRO"
    ONOMATOPEE = "ONO"

    @classmethod
    def parse(cls, val: str) -> "CatGram":
        """Parse a raw 'cgram' entry (e.g. "VER:inf") into a CatGram.

        Only the part before the first ':' is significant; anything after
        it is discarded.
        """
        base, _, _ = val.partition(":")
        return cls(base)
|
||||
|
||||
|
||||
class Word(t.NamedTuple):
    """One inflected-form entry from the Lexique dataset."""

    word: str  # the inflected form, as written
    lemme: str  # canonical form (lemma)
    cat_gram: CatGram  # grammatical category of this form
    # NOTE(review): Lexique.parse() fills these from the "freqlemlivres" /
    # "freqlivres" columns, i.e. frequencies measured in *books* (livres),
    # not films as the original comments said — confirm intended source.
    freq_lem: float  # occurrences of the canonical form, by million words
    freq: float  # occurrences of this exact form, by million words
|
||||
|
||||
|
||||
class Lexique:
    """In-memory view of the Lexique 3.83 French word-frequency dataset.

    The dataset is shipped as a .tar.xz archive under data/raw/ and is
    extracted on first use.
    """

    LEXIQUE_DIR_PATH = Path(__file__).parent.parent / "data/raw/Lexique383"
    LEXIQUE_PATH = LEXIQUE_DIR_PATH / "Lexique383.tsv"

    # Default number of most-frequent words kept by `most_common` for each
    # grammatical category when no explicit threshold is given.
    PRESET_THRESHOLD_BY_CAT: dict[CatGram, int] = {
        CatGram.NOM: 10000,
        CatGram.VERBE: 10000,
        CatGram.ADJECTIF: 10000,
        CatGram.ADVERBE: 10000,
    }

    # All parsed entries, in file order.
    dataset: list[Word]

    def __init__(self, dataset: list[Word]):
        self.dataset = dataset

    @classmethod
    def _ensure_uncompressed(cls):
        """Ensure the dataset directory exists, extracting the archive if needed.

        Raises:
            Exception: if the compressed archive is missing, or if extraction
                did not produce the expected directory.
        """
        if cls.LEXIQUE_DIR_PATH.exists():
            return

        lexique_archive = cls.LEXIQUE_DIR_PATH.with_suffix(".tar.xz")
        if not lexique_archive.exists():
            # Fix: use the module-level `logger`, not the root logger that the
            # original `logging.error(...)` calls went through.
            logger.error("Missing compressed dataset at %s", lexique_archive)
            raise Exception(f"Missing compressed dataset at {lexique_archive}")

        logger.info("Uncompressing dataset")
        subprocess.check_call(
            [
                "tar",
                "-xJf",
                lexique_archive.as_posix(),
                "-C",
                lexique_archive.parent.as_posix(),
            ]
        )

        if not cls.LEXIQUE_DIR_PATH.exists():
            logger.error(
                "Uncompressed dataset still missing at %s after extraction",
                cls.LEXIQUE_DIR_PATH,
            )
            raise Exception(
                f"Uncompressed dataset still missing at {cls.LEXIQUE_DIR_PATH} after extraction"
            )

    @classmethod
    def parse(cls) -> "Lexique":
        """Parse the TSV dataset into a Lexique instance.

        Rows without a grammatical category ('cgram' column) are skipped.

        Raises:
            ValueError: if a row's frequency columns cannot be parsed.
        """
        # Fix: extract the archive first — the original opened LEXIQUE_PATH
        # directly and never called _ensure_uncompressed().
        cls._ensure_uncompressed()
        out = []
        # Lexique 3.83 is distributed as UTF-8; be explicit rather than
        # relying on the platform default encoding.
        with cls.LEXIQUE_PATH.open("r", encoding="utf-8") as h:
            reader = csv.DictReader(h, dialect="excel-tab")
            for row in reader:
                if not row["cgram"]:
                    continue
                try:
                    out.append(
                        Word(
                            word=row["ortho"],
                            lemme=row["lemme"],
                            cat_gram=CatGram.parse(row["cgram"]),
                            freq_lem=float(row["freqlemlivres"]),
                            freq=float(row["freqlivres"]),
                        )
                    )
                except ValueError:
                    # Log the offending row, then re-raise with the original
                    # traceback (was: print(row); raise exn from exn).
                    logger.error("Failed to parse row: %r", row)
                    raise
        return cls(out)

    def most_common(
        self, cat_gram: CatGram, threshold: t.Optional[int] = None
    ) -> list[Word]:
        """Return the `threshold` most frequent words of category `cat_gram`.

        Args:
            cat_gram: grammatical category to select.
            threshold: how many words to keep; defaults to the per-category
                preset in PRESET_THRESHOLD_BY_CAT.

        Raises:
            ValueError: if no threshold is given and no preset exists for
                `cat_gram`.
        """
        if threshold is None:
            try:
                threshold = self.PRESET_THRESHOLD_BY_CAT[cat_gram]
            except KeyError as exn:
                raise ValueError(
                    f"No threshold preset for grammatical category {cat_gram}, "
                    "please provide a threshold manually"
                ) from exn
        # Most frequent first, truncated to the threshold.
        out = [word for word in self.dataset if word.cat_gram == cat_gram]
        out.sort(key=lambda word: word.freq, reverse=True)
        return out[:threshold]
|
0
requirements.txt
Normal file
0
requirements.txt
Normal file
33
setup.py
Normal file
33
setup.py
Normal file
|
@ -0,0 +1,33 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from setuptools import setup, find_packages
|
||||
|
||||
|
||||
def parse_requirements():
    """Read requirements.txt and return the list of requirement strings.

    Fixes the original, which returned raw lines (including the trailing
    newline, blank lines, and comment lines) — unsuitable values for
    setuptools' `install_requires`.

    Returns:
        A list of cleaned requirement specifiers.
    """
    reqs = []
    with open("requirements.txt", "r") as handle:
        for line in handle:
            req = line.strip()
            # Skip blank lines and pip-style '#' comments.
            if not req or req.startswith("#"):
                continue
            reqs.append(req)
    return reqs
|
||||
|
||||
|
||||
# Read the long description up front with a context manager — the original
# leaked the file handle via an inline open() in the setup() call.
with open("README.md", "r") as readme:
    long_description = readme.read()

setup(
    name="pwgen_fr",
    version="0.1.0",
    description="Générateur de mots de passes forts basés sur des mots français, et les listes de mots associées",
    author="tobast",
    author_email="contact@tobast.fr",
    # NOTE(review): setuptools expects a license *name* here (e.g. "MIT"),
    # not a file name — confirm the intended license and adjust.
    license="LICENSE",
    url="https://git.tobast.fr/tobast/pwgen-fr/",
    packages=find_packages(),
    include_package_data=True,
    long_description=long_description,
    install_requires=parse_requirements(),
    entry_points={
        # No console scripts yet; removed the commented-out entries that were
        # copy-pasted from an unrelated project (proxmox_scripts).
        "console_scripts": []
    },
)
|
Loading…
Reference in a new issue