Compare commits


17 commits

SHA1 Message Date
16a72733a1 Update data README with Grammalecte 2025-02-14 21:33:11 +01:00
ef10649607 Factor out dataset uncompression 2025-02-14 21:33:11 +01:00
2420584b76 Add lxml requirement (needed for Morphalou) 2025-02-14 13:34:32 +01:00
64ae84b0cd Fixup rand6 being actually rand4 2024-12-10 21:49:09 +01:00
64a3a30ddc Add pwgen-fr entrypoint 2024-11-02 22:35:20 +01:00
4f152d45f2 Morphalou: frequency filter based on Lexique 2024-09-23 19:30:40 +02:00
a872ecb0f9 Morphalou: use new dataset location, uncompress 2024-09-19 21:06:08 +02:00
e8379656e1 Add morphalou dataset and related README 2024-09-19 20:58:48 +02:00
3c10d987e6 Generate: update to use word_db 2024-09-16 23:03:13 +02:00
62bb8076e9 word_db: add random-pick methods 2024-09-16 23:01:37 +02:00
695813d35f Fixup word_db unserialization 2024-09-16 22:41:45 +02:00
a9c3c90405 word_db: serialize correctly 2024-09-16 18:41:17 +02:00
b086b9a08d Parse morphalou as word_db 2024-09-16 18:41:17 +02:00
f3df51ae26 Lexique: work towards word_db (still problems with other forms) 2024-09-10 00:30:29 +02:00
cf455556be Add __init__ for mypy 2024-09-10 00:30:16 +02:00
ff4a71d1a7 Add word_db as a target 2024-09-10 00:30:03 +02:00
874329c982 Accords des noms, conjugaison des verbes (WiP) 2024-09-09 00:28:50 +02:00
15 changed files with 881 additions and 80 deletions

1
.gitignore vendored

@@ -162,3 +162,4 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
tmp

README.md

@@ -6,6 +6,5 @@ associated word lists.
## Word sources
Words and their properties (frequencies, grammatical categories, etc.) are
taken from the [Lexique](http://lexique.org) database. These data, and the
lists derived from them, are under a [CC
BY-NC license](https://creativecommons.org/licenses/by-nc/4.0/).
taken from various databases, under various licenses. See the README file in
`data/raw` for details.

2
data/raw/.gitignore vendored

@@ -1 +1,3 @@
Lexique383
Morphalou3.1_formatTEI
grammalecte

Binary file not shown.

42
data/raw/README.md Normal file

@@ -0,0 +1,42 @@
# Reduced versions of datasets
The files in this folder are reduced versions of third-party datasets.
Make sure to respect the respective licenses of these resources in whatever
use you make of them.
## Lexique
The Lexique database (http://www.lexique.org/) is the work of, among other
contributors, Boris New and Christophe Pallier, and is under the CC BY-NC
license.
The file here is a truncated version of v3.83. It keeps only the part useful
to this software. The full dataset is available on their website.
## Morphalou
The Morphalou database
(https://www.ortolang.fr/market/lexicons/morphalou/v3.1) is the work of,
among other contributors, Sandrine Ollinger, Christophe Benzitoun, Evelyne
Jacquey, Ulrike Fleury, Etienne Petitjean and Marie Tonnelier. Its version
3.1 is distributed under the LGPL-LR license.
The file here is a truncated version of v3.1. It keeps only the part useful
to this software. The full dataset is available on their website.
## Grammalecte
The Grammalecte database (https://grammalecte.net) provides word-frequency
data for the French language. It is the work of Olivier R. and other
contributors. Grammalecte is published under the GNU GPL version 3 license.
The file here comes from the "fossil uv export …" commands listed on
[this
page](http://grammalecte.net:8080/wiki?name=How+to+clone+this+repository).
Only the files that are useful to us here are kept, compressed with xz.

0
pwgen_fr/__init__.py Normal file

22
pwgen_fr/entrypoints.py Normal file

@@ -0,0 +1,22 @@
import argparse
import typing as t
from . import generate
def pwgen_fr():
choices_map: dict[str, t.Callable[[], str]] = {
"phrase4": generate.gen_phrase4,
"phrase6": generate.gen_phrase6,
"rand4": lambda: generate.gen_rand(n=4),
"rand6": lambda: generate.gen_rand(n=6),
}
parser = argparse.ArgumentParser()
parser.add_argument(
"mode", choices=choices_map.keys(), help="Select the generation procedure used"
)
args = parser.parse_args()
print(choices_map[args.mode]())
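A quick way to sanity-check the new entrypoint without installing the console script is to drive the function with a patched `sys.argv`; a minimal sketch (the invocation below is an assumption, not part of the diff):

```python
# Hypothetical smoke test: simulate `pwgen-fr phrase4` without installing
# the console_scripts entrypoint declared in setup.py.
import sys

from pwgen_fr import entrypoints

sys.argv = ["pwgen-fr", "phrase4"]
entrypoints.pwgen_fr()  # prints a four-word French passphrase on stdout
```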

pwgen_fr/generate.py

@@ -1,37 +1,72 @@
import secrets
from . import lexique
from . import word_db
lex = lexique.Lexique.parse()
wdb = word_db.WordDb.autoload()
def gen_phrase4():
out = []
out.append(secrets.choice(lex.most_common(lexique.CatGram.ADJECTIF)))
out.append(secrets.choice(lex.most_common(lexique.CatGram.NOM)))
out.append(secrets.choice(lex.most_common(lexique.CatGram.VERBE)))
out.append(secrets.choice(lex.most_common(lexique.CatGram.NOM)))
return " ".join(map(lambda x: x.word, out))
def gen_phrase4() -> str:
"""Generates a sentence with four words, of structure Adjective Noun Verb Adverb"""
nombre = word_db.Nombre.pick()
temps = word_db.Temps.pick()
adj = secrets.choice(wdb.adjectifs)
nom = secrets.choice(wdb.noms)
verbe = secrets.choice(wdb.verbes)
adverbe = secrets.choice(wdb.adverbes)
return " ".join(
[
adj.accord(nom.genre_or_pick, nombre),
nom.accord(nombre),
verbe.accord(temps, nombre),
adverbe.accord(),
]
)
def gen_rand(n=4):
def gen_phrase6() -> str:
"""Generates a sentence with six words, of structure Adjective Noun Verb Adjective
Noun Adverb"""
nombres = [word_db.Nombre.pick() for _ in range(2)]
temps = word_db.Temps.pick()
adj0 = secrets.choice(wdb.adjectifs)
nom0 = secrets.choice(wdb.noms)
verbe = secrets.choice(wdb.verbes)
adj1 = secrets.choice(wdb.adjectifs)
nom1 = secrets.choice(wdb.noms)
adverbe = secrets.choice(wdb.adverbes)
return " ".join(
[
adj0.accord(nom0.genre_or_pick, nombres[0]),
nom0.accord(nombres[0]),
verbe.accord(temps, nombres[0]),
adj1.accord(nom1.genre_or_pick, nombres[1]),
nom1.accord(nombres[1]),
adverbe.accord(),
]
)
def gen_rand(n=4) -> str:
"""Generates a fully random sequence of n words, without grammatical consistency"""
out = []
for _ in range(n):
cat = secrets.choice(
(
lexique.CatGram.ADJECTIF,
lexique.CatGram.NOM,
lexique.CatGram.VERBE,
lexique.CatGram.ADVERBE,
)
)
out.append(secrets.choice(lex.most_common(cat)))
return " ".join(map(lambda x: x.word, out))
word_cat = secrets.choice(list(wdb.CATEGORY_TO_ATTR))
if word_cat == word_db.Nom:
nombre = word_db.Nombre.pick()
out.append(secrets.choice(wdb.noms).accord(nombre))
elif word_cat == word_db.Adjectif:
genre = word_db.Genre.pick()
nombre = word_db.Nombre.pick()
out.append(secrets.choice(wdb.adjectifs).accord(genre, nombre))
elif word_cat == word_db.Verbe:
temps = word_db.Temps.pick()
nombre = word_db.Nombre.pick()
out.append(secrets.choice(wdb.verbes).accord(temps, nombre))
elif word_cat == word_db.Adverbe:
out.append(secrets.choice(wdb.adverbes).accord())
def gen_nom(n=4):
out = []
for _ in range(n):
cat = lexique.CatGram.NOM
out.append(secrets.choice(lex.most_common(cat)))
return " ".join(map(lambda x: x.word, out))
return " ".join(out)

pwgen_fr/lexique.py

@@ -1,9 +1,13 @@
import csv
import itertools
from dataclasses import dataclass, field
import logging
import subprocess
import typing as t
from bisect import bisect_left
import enum
from pathlib import Path
from .word_db import Genre, Nombre, Temps, Nom, Adjectif, Verbe, Adverbe, WordDb
from . import util
logger = logging.getLogger(__name__)
@@ -27,13 +31,25 @@ class CatGram(enum.Enum):
base = val.split(":", maxsplit=1)[0]
return cls(base)
def __lt__(self, oth):
return self.value < oth.value
class Word(t.NamedTuple):
word: str
lemme: str # canonical form
def match_enum_or_all(val: str, enum_mapper, enum_cls) -> list:
"""The value of the enum corresponding if any; else, all terms of the enum"""
if val in enum_mapper:
return [enum_mapper[val]]
return list(enum_cls)
@dataclass
class Mot:
mot: str
lemme: str
cat_gram: CatGram
freq_lem: float # occurrences of the canonical form, in films, per million words
freq: float # occurrences of this exact form, in films, per million words
freq: float # occurrences of the canonical form, per million words
variantes: dict[tuple, str] = field(default_factory=dict)
genre: t.Optional[Genre] = None
class Lexique:
@@ -47,68 +63,155 @@ class Lexique:
CatGram.ADVERBE: 10000,
}
dataset: list[Word]
class Parsers:
"""Datatables to help parse the original data"""
def __init__(self, dataset):
genre: dict[str, Genre] = {
"m": Genre.MASC,
"f": Genre.FEM,
}
rev_genre: dict[t.Optional[Genre], str] = {
None: "",
Genre.MASC: "m",
Genre.FEM: "f",
}
nombre: dict[str, Nombre] = {
"s": Nombre.SING,
"p": Nombre.PLUR,
}
verbe_temps: dict[str, Temps] = {
"ind:pre": Temps.PRESENT,
"ind:fut": Temps.FUTUR,
"ind:imp": Temps.IMPARFAIT,
}
verbe_personne: dict[str, Nombre] = {
"3s": Nombre.SING,
"3p": Nombre.PLUR,
}
dataset: list[Mot]
lemfreq: dict[str, float]
def __init__(self, dataset, lemfreq):
self.dataset = dataset
self.lemfreq = lemfreq
@classmethod
def _ensure_uncompressed(cls):
"""Ensures the dataset is uncompressed"""
if cls.LEXIQUE_DIR_PATH.exists():
return
util.ensure_dataset_uncompressed(cls.LEXIQUE_DIR_PATH, "Lexique")
lexique_archive = cls.LEXIQUE_DIR_PATH.with_suffix(".tar.xz")
if not lexique_archive.exists():
logging.error("Missing compressed dataset at %s", lexique_archive)
raise Exception(f"Missing compressed dataset at {lexique_archive}")
@classmethod
def _find_word_key(cls, mot: Mot):
return (mot.lemme, mot.cat_gram, cls.Parsers.rev_genre[mot.genre])
logging.info("Uncompressing dataset")
subprocess.check_call(
[
"tar",
"-xJf",
lexique_archive.as_posix(),
"-C",
lexique_archive.parent.as_posix(),
]
@classmethod
def _find_word(cls, dataset: list[Mot], row: dict) -> t.Optional[Mot]:
str_lemme = row["lemme"]
cat_gram = CatGram.parse(row["cgram"])
genre = row["genre"] if cat_gram == CatGram.NOM else ""
row_key = (
str_lemme,
cat_gram,
genre,
)
if not cls.LEXIQUE_DIR_PATH.exists():
logging.error(
"Uncompressed dataset still missing at %s after extraction",
cls.LEXIQUE_DIR_PATH,
)
raise Exception(
f"Uncompressed dataset still missing at {cls.LEXIQUE_DIR_PATH} after extraction"
)
lemme_pos = bisect_left(
dataset,
row_key,
key=cls._find_word_key,
)
if lemme_pos >= len(dataset):
return None
out = dataset[lemme_pos]
if row_key != cls._find_word_key(out):
return None
return dataset[lemme_pos]
@classmethod
def parse(cls) -> "Lexique":
out = []
rows = []
lemfreq: dict[str, float] = {}
with cls.LEXIQUE_PATH.open("r") as h:
reader = csv.DictReader(h, dialect="excel-tab")
for row in reader:
if not row["cgram"]:
continue
try:
out.append(
Word(
word=row["ortho"],
lemme=row["lemme"],
cat_gram=CatGram.parse(row["cgram"]),
freq_lem=float(row["freqlemlivres"]),
freq=float(row["freqlivres"]),
)
)
except ValueError as exn:
print(row)
raise exn from exn
return cls(out)
rows.append(row)
# First pass: generate canonical forms (lemmes)
for row in rows:
cat_gram = CatGram.parse(row["cgram"])
if (row["lemme"] != row["ortho"]) and not (
cat_gram == CatGram.NOM and row["genre"] == "f" and row["nombre"] == "s"
):
# A feminine singular noun is also considered a canonical form
continue
genre: t.Optional[Genre] = None
if cat_gram == CatGram.NOM:
genre = cls.Parsers.genre.get(row["genre"], None)
out.append(
Mot(
mot=row["ortho"],
lemme=row["lemme"],
cat_gram=cat_gram,
freq=float(row["freqlemlivres"]),
genre=genre,
)
)
out.sort(key=cls._find_word_key) # We need to bisect on this.
# Second pass: populate variants
for row in rows:
# Populate lemfreq
old_freq = lemfreq.get(row["ortho"], 0.0)
lemfreq[row["ortho"]] = max(
old_freq,
float(row["freqlemlivres"]),
float(row["freqlemfilms2"]),
)
lemme = cls._find_word(out, row)
if lemme is None:
continue
if lemme.cat_gram == CatGram.NOM:
nombres = match_enum_or_all(row["nombre"], cls.Parsers.nombre, Nombre)
for nombre in nombres:
lemme.variantes[(nombre,)] = row["ortho"]
elif lemme.cat_gram == CatGram.VERBE:
infover = row["infover"].split(";")
for raw_ver in infover:
ver = raw_ver.split(":")
temps = None
personne = None
temps_select = ":".join(ver[0:2])
if temps_select not in Temps:
continue
temps = Temps(temps_select)
personne = cls.Parsers.verbe_personne.get(ver[2], None)
if personne is None:
continue # we're not interested in all conj. persons
lemme.variantes[(temps, personne)] = row["ortho"]
elif lemme.cat_gram == CatGram.ADJECTIF:
genres = match_enum_or_all(row["genre"], cls.Parsers.genre, Genre)
nombres = match_enum_or_all(row["nombre"], cls.Parsers.nombre, Nombre)
for genre, nombre in itertools.product(genres, nombres):
lemme.variantes[(genre, nombre)] = row["ortho"]
# No need to match adverbs (invariant)
return cls(out, lemfreq)
def most_common(
self, cat_gram: CatGram, threshold: t.Optional[int] = None
) -> list[Word]:
) -> list[Mot]:
if threshold is None:
try:
threshold = self.PRESET_THRESHOLD_BY_CAT[cat_gram]
@@ -120,3 +223,52 @@ class Lexique:
out = list(filter(lambda word: word.cat_gram == cat_gram, self.dataset))
out.sort(key=lambda word: word.freq, reverse=True)
return out[:threshold]
def word_db(self, thresholds: t.Optional[dict[CatGram, int]] = None) -> WordDb:
"""Convert to a WordDb"""
thresholds = thresholds or {}
noms = self.most_common(CatGram.NOM, thresholds.get(CatGram.NOM, None))
db_noms = [
Nom(
genre=t.cast(Genre, nom.genre), # not None for noms
sing=nom.variantes[(Nombre.SING,)],
plur=nom.variantes[(Nombre.PLUR,)],
)
for nom in noms
]
adjectifs = self.most_common(
CatGram.ADJECTIF, thresholds.get(CatGram.ADJECTIF, None)
)
db_adjectifs = [
Adjectif(
masc_sing=adj.variantes[(Genre.MASC, Nombre.SING)],
masc_plur=adj.variantes[(Genre.MASC, Nombre.PLUR)],
fem_sing=adj.variantes[(Genre.FEM, Nombre.SING)],
fem_plur=adj.variantes[(Genre.FEM, Nombre.PLUR)],
)
for adj in adjectifs
]
verbes = self.most_common(CatGram.VERBE, thresholds.get(CatGram.VERBE, None))
db_verbes = [
Verbe(
present_sing=verbe.variantes[(Temps.PRESENT, Nombre.SING)],
present_plur=verbe.variantes[(Temps.PRESENT, Nombre.PLUR)],
futur_sing=verbe.variantes[(Temps.FUTUR, Nombre.SING)],
futur_plur=verbe.variantes[(Temps.FUTUR, Nombre.PLUR)],
imparfait_sing=verbe.variantes[(Temps.IMPARFAIT, Nombre.SING)],
imparfait_plur=verbe.variantes[(Temps.IMPARFAIT, Nombre.PLUR)],
)
for verbe in verbes
]
adverbes = self.most_common(
CatGram.ADVERBE, thresholds.get(CatGram.ADVERBE, None)
)
db_adverbes = [Adverbe(adv=adv.mot) for adv in adverbes]
return WordDb(
noms=db_noms, adjectifs=db_adjectifs, verbes=db_verbes, adverbes=db_adverbes
)
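Putting the new pipeline together, a hypothetical one-off conversion script (the output file name is an assumption):

```python
# Build a WordDb from Lexique and persist it as gzipped JSON.
import gzip

from pwgen_fr import lexique

lex = lexique.Lexique.parse()  # extracts data/raw/Lexique383 on first run
db = lex.word_db()             # applies the per-category thresholds
with gzip.open("words.json.gz", "wt", encoding="utf-8") as fd:
    db.save(fd)
```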

230
pwgen_fr/morphalou.py Normal file

@@ -0,0 +1,230 @@
"""Reads the Morphalou dataset, in its TSV form"""
import itertools
import logging
import typing as t
from pathlib import Path
from lxml import etree
from . import util
from .word_db import Adjectif, Adverbe, Genre, Nom, Nombre, Temps, Verbe, WordDb
TSV_NS = {
"tsv": "http://www.tei-c.org/ns/1.0",
"xml": "http://www.w3.org/XML/1998/namespace",
}
logger = logging.getLogger(__name__)
class MorphalouSet:
MORPHALOU_DIR_PATH = (
Path(__file__).parent.parent / "data/raw/Morphalou3.1_formatTEI"
)
MORPHALOU_FILENAME_TEMPLATE = "{cat_name}_Morphalou3.1_TEI.xml"
CAT_MAPPING: dict[t.Type[t.NamedTuple], str] = {
Nom: "commonNoun",
Adjectif: "adjective",
Verbe: "verb",
Adverbe: "adverb",
}
word_db: WordDb
def __init__(self):
self.word_db = WordDb()
@classmethod
def _ensure_uncompressed(cls):
"""Ensures the dataset is uncompressed"""
util.ensure_dataset_uncompressed(cls.MORPHALOU_DIR_PATH, "Morphalou")
def parse(self):
"""Parses the dataset"""
self.__class__._ensure_uncompressed()
for cat, cat_file in self.__class__.CAT_MAPPING.items():
word_db_elt = WordDb.CATEGORY_TO_ATTR[cat]
logging.info("Parsing %s...", word_db_elt)
setattr(
self.word_db,
word_db_elt,
getattr(self, f"_parse_{word_db_elt}")(
self.__class__.MORPHALOU_DIR_PATH
/ self.__class__.MORPHALOU_FILENAME_TEMPLATE.format(
cat_name=cat_file
)
),
)
def _tsv_elems(self, tsv_path: Path):
"""Opens a TSV file, and returns the <body> node, direct parent of all the
relevant nodes"""
with tsv_path.open("r") as h:
tree = etree.parse(h)
root = tree.getroot()
body = root.find("./tsv:text/tsv:body", TSV_NS)
return body
def _parse_noms(self, tsv_path: Path) -> list[Nom]:
"""Parse the nouns"""
root = self._tsv_elems(tsv_path)
out: list[Nom] = []
for entry in root.iterfind("./tsv:entry", TSV_NS):
try:
genre = self._genre(
entry.find(
"./tsv:form[@type='lemma']/tsv:gramGrp/tsv:gen", TSV_NS
).text
)
except AttributeError:
continue # some nouns don't have a gender defined, somehow -- ignore
forms = {}
for inflected in entry.iterfind("./tsv:form[@type='inflected']", TSV_NS):
orth = inflected.find("./tsv:orth", TSV_NS).text
nombres = self._nombre_set(
inflected.find("./tsv:gramGrp/tsv:number", TSV_NS).text
)
for form in nombres:
forms[form] = orth
try:
out.append(
Nom(
genre=genre,
sing=forms[Nombre.SING],
plur=forms[Nombre.PLUR],
)
)
except KeyError:
continue # cannot be inflected to all required forms: skip
return out
def _parse_adjectifs(self, tsv_path: Path) -> list[Adjectif]:
"""Parse the adjectives"""
root = self._tsv_elems(tsv_path)
out: list[Adjectif] = []
for entry in root.iterfind("./tsv:entry", TSV_NS):
forms = {}
for inflected in entry.iterfind("./tsv:form[@type='inflected']", TSV_NS):
orth = inflected.find("./tsv:orth", TSV_NS).text
gram_grp = inflected.find("./tsv:gramGrp", TSV_NS)
genres = self._genre_set(gram_grp.find("./tsv:gen", TSV_NS).text)
nombres = self._nombre_set(gram_grp.find("./tsv:number", TSV_NS).text)
for form in itertools.product(genres, nombres):
forms[form] = orth
try:
out.append(
Adjectif(
masc_sing=forms[Genre.MASC, Nombre.SING],
masc_plur=forms[Genre.MASC, Nombre.PLUR],
fem_sing=forms[Genre.FEM, Nombre.SING],
fem_plur=forms[Genre.FEM, Nombre.PLUR],
)
)
except KeyError:
continue # cannot be inflected to all required forms: skip
return out
def _parse_verbes(self, tsv_path: Path) -> list[Verbe]:
"""Parse the verbs"""
root = self._tsv_elems(tsv_path)
out: list[Verbe] = []
for entry in root.iterfind("./tsv:entry", TSV_NS):
forms = {}
for inflected in entry.iterfind("./tsv:form[@type='inflected']", TSV_NS):
gram_grp = inflected.find("./tsv:gramGrp", TSV_NS)
# Order of tests is important! If mood == 'participle', there is no
# 'person' defined.
if (
gram_grp.find("./tsv:mood", TSV_NS).text != "indicative"
or gram_grp.find("./tsv:per", TSV_NS).text != "thirdPerson"
):
continue # irrelevant for us
temps = self._tense(gram_grp.find("./tsv:tns", TSV_NS).text)
if temps is None:
continue # irrelevant for us
nombres = self._nombre_set(gram_grp.find("./tsv:number", TSV_NS).text)
orth = inflected.find("./tsv:orth", TSV_NS).text
for nombre in nombres:
forms[(temps, nombre)] = orth
try:
out.append(
Verbe(
present_sing=forms[Temps.PRESENT, Nombre.SING],
present_plur=forms[Temps.PRESENT, Nombre.PLUR],
futur_sing=forms[Temps.FUTUR, Nombre.SING],
futur_plur=forms[Temps.FUTUR, Nombre.PLUR],
imparfait_sing=forms[Temps.IMPARFAIT, Nombre.SING],
imparfait_plur=forms[Temps.IMPARFAIT, Nombre.PLUR],
)
)
except KeyError:
continue # cannot be inflected to all required forms: skip
return out
def _parse_adverbes(self, tsv_path: Path) -> list[Adverbe]:
"""Parse the adverbs"""
root = self._tsv_elems(tsv_path)
out: list[Adverbe] = []
for entry in root.iterfind("./tsv:entry", TSV_NS):
# We're only interested in the lemma form
orth = entry.find("./tsv:form[@type='lemma']/tsv:orth", TSV_NS)
assert orth is not None
adv = orth.text
out.append(Adverbe(adv=adv))
return out
@staticmethod
def _genre_set(genre: str) -> list[Genre]:
return {
"masculine": [Genre.MASC],
"feminine": [Genre.FEM],
"invariable": [Genre.MASC, Genre.FEM],
}[genre]
@staticmethod
def _genre(genre: str) -> Genre:
return {
"masculine": Genre.MASC,
"feminine": Genre.FEM,
"invariable": Genre.INV,
}[genre]
@staticmethod
def _nombre(nombre: str) -> Nombre:
return {
"singular": Nombre.SING,
"plural": Nombre.PLUR,
}[nombre]
@staticmethod
def _nombre_set(nombre: str) -> list[Nombre]:
return {
"singular": [Nombre.SING],
"plural": [Nombre.PLUR],
"invariable": [Nombre.SING, Nombre.PLUR],
}[nombre]
@staticmethod
def _tense(tense: str) -> t.Optional[Temps]:
return {
"present": Temps.PRESENT,
"imperfect": Temps.IMPARFAIT,
"future": Temps.FUTUR,
}.get(tense, None)
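A small smoke-test sketch for the parser, assuming the Morphalou archive is in place (the printed counts are illustrative only):

```python
# Parse the four TEI XML files and report how many words survived per category.
from pwgen_fr import morphalou

ms = morphalou.MorphalouSet()
ms.parse()
db = ms.word_db
print(len(db.noms), len(db.adjectifs), len(db.verbes), len(db.adverbes))
```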


@@ -0,0 +1,74 @@
""" Generates a worddb based on Morphalou, but limits to frequent words based on
external sources (eg Lexique) """
import logging
import typing as t
from . import lexique, morphalou
from .word_db import Adjectif, Adverbe, Genre, Nom, Nombre, Temps, Verbe, WordDb
logger = logging.getLogger(__name__)
class MorphalouFreqSet:
morphalou_db: WordDb
lexique: lexique.Lexique
filtered_db: WordDb
def __init__(
self,
morphalou_db: t.Optional[WordDb] = None,
lex: t.Optional[lexique.Lexique] = None,
):
if not morphalou_db:
morphalou_set = morphalou.MorphalouSet()
morphalou_set.parse()
self.morphalou_db = morphalou_set.word_db
else:
self.morphalou_db = morphalou_db
if not lex:
# the parameter is named `lex` so it does not shadow the `lexique` module
self.lexique = lexique.Lexique.parse()
else:
self.lexique = lex
self.filtered_db = self._filter_lexique()
def _filter_nom(self, nom: Nom) -> bool:
freq = max(
self.lexique.lemfreq.get(nom.sing, 0.0),
self.lexique.lemfreq.get(nom.plur, 0.0),
)
return freq > 0
def _filter_adjectif(self, adjectif: Adjectif) -> bool:
freq = max(
self.lexique.lemfreq.get(adjectif.masc_sing, 0.0),
self.lexique.lemfreq.get(adjectif.fem_sing, 0.0),
)
return freq > 0
def _filter_verbe(self, verbe: Verbe) -> bool:
freq = max(
self.lexique.lemfreq.get(verbe.present_sing, 0.0),
self.lexique.lemfreq.get(verbe.futur_sing, 0.0),
self.lexique.lemfreq.get(verbe.imparfait_sing, 0.0),
self.lexique.lemfreq.get(verbe.present_plur, 0.0),
self.lexique.lemfreq.get(verbe.futur_plur, 0.0),
self.lexique.lemfreq.get(verbe.imparfait_plur, 0.0),
)
return freq > 0
def _filter_adverbe(self, adverbe: Adverbe) -> bool:
if " " in adverbe.adv:
return False
freq = self.lexique.lemfreq.get(adverbe.adv, 0.0)
return freq > 0
def _filter_lexique(self) -> WordDb:
out = WordDb()
out.noms = list(filter(self._filter_nom, self.morphalou_db.noms))
out.adjectifs = list(filter(self._filter_adjectif, self.morphalou_db.adjectifs))
out.verbes = list(filter(self._filter_verbe, self.morphalou_db.verbes))
out.adverbes = list(filter(self._filter_adverbe, self.morphalou_db.adverbes))
return out
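How the pieces fit together: a sketch of regenerating the default database (the module name `morphalou_freqs` is an assumption, as the new file's path is not shown in this capture):

```python
# Build the Lexique-filtered Morphalou WordDb and write it to the default
# location that WordDb.autoload() reads from.
import gzip

from pwgen_fr.morphalou_freqs import MorphalouFreqSet  # hypothetical module path
from pwgen_fr.word_db import WordDb

freq_set = MorphalouFreqSet()  # parses Morphalou, filters against Lexique freqs
with gzip.open(WordDb.SERIALIZED_GZ_LOCATION, "wt", encoding="utf-8") as fd:
    freq_set.filtered_db.save(fd)
```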

39
pwgen_fr/util.py Normal file

@@ -0,0 +1,39 @@
"""Various utility functions used throughout the codebase"""
import subprocess
from pathlib import Path
import logging
def ensure_dataset_uncompressed(dir_path: Path, name: str):
"""Ensures the directory dir_path exists; if not, attempts to uncompress the name
suffixed by .tar.xz."""
if dir_path.exists():
return
archive_path = dir_path.with_suffix(dir_path.suffix + ".tar.xz")
if not archive_path.exists():
logging.error("Missing %s compressed dataset at %s", name, archive_path)
raise Exception(f"Missing {name} compressed dataset at {archive_path}")
logging.info("Uncompressing %s dataset", name)
subprocess.check_call(
[
"tar",
"-xJf",
archive_path.as_posix(),
"-C",
archive_path.parent.as_posix(),
]
)
if not dir_path.exists():
logging.error(
"Uncompressed %s dataset still missing at %s after extraction",
name,
dir_path,
)
raise Exception(
f"Uncompressed {name} dataset still missing at {dir_path} after extraction"
)
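For reference, the factored-out helper is called the same way as in lexique.py and morphalou.py; a minimal sketch with an assumed path:

```python
from pathlib import Path

from pwgen_fr import util

# No-op if the directory already exists; otherwise extracts the sibling
# data/raw/Lexique383.tar.xz archive next to it.
util.ensure_dataset_uncompressed(Path("data/raw/Lexique383"), "Lexique")
```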

206
pwgen_fr/word_db.py Normal file

@@ -0,0 +1,206 @@
""" A pre-processed database of words, independant of their source """
import gzip
import json
import secrets
import typing as t
from enum import Enum
from pathlib import Path
class Genre(Enum):
MASC = "masculin"
FEM = "féminin"
INV = "invariable" # pour les noms uniquement
@classmethod
def pick(cls) -> "Genre":
"""random-pick (avoids inv)"""
return secrets.choice([cls.MASC, cls.FEM])
class Nombre(Enum):
SING = "singulier"
PLUR = "pluriel"
@classmethod
def pick(cls) -> "Nombre":
"""random-pick"""
return secrets.choice(list(cls))
class Temps(Enum):
PRESENT = "present"
FUTUR = "futur"
IMPARFAIT = "imparfait"
@classmethod
def pick(cls) -> "Temps":
"""random-pick"""
return secrets.choice(list(cls))
class Nom(t.NamedTuple):
"""Nom commun"""
genre: Genre
sing: str
plur: str
def __str__(self) -> str:
return f"{self.sing}"
def accord(self, nombre: Nombre) -> str:
"""Accorde en nombre"""
return getattr(self, nombre.name.lower())
@property
def genre_or_pick(self) -> Genre:
"""Genre of the noun, or random-pick if invariable"""
if self.genre == Genre.INV:
return Genre.pick()
return self.genre
@property
def serialized(self):
return {"genre": self.genre.name, "sing": self.sing, "plur": self.plur}
@classmethod
def unserialized(cls, kwargs):
genre = Genre[kwargs.pop("genre")]
return cls(**kwargs, genre=genre)
class Adjectif(t.NamedTuple):
masc_sing: str
masc_plur: str
fem_sing: str
fem_plur: str
def __str__(self) -> str:
return f"{self.masc_sing}/{self.fem_sing}"
def accord(self, genre: Genre, nombre: Nombre) -> str:
"""Accorde en genre et en nombre"""
return getattr(self, f"{genre.name.lower()}_{nombre.name.lower()}")
@property
def serialized(self):
return self._asdict()
@classmethod
def unserialized(cls, kwargs):
return cls(**kwargs)
class Verbe(t.NamedTuple):
present_sing: str
present_plur: str
futur_sing: str
futur_plur: str
imparfait_sing: str
imparfait_plur: str
def __str__(self) -> str:
return f"{self.present_sing}"
def accord(self, temps: Temps, nombre: Nombre) -> str:
"""Accorde en temps et en nombre (seule la 3è pers. est utilisée)"""
return getattr(self, f"{temps.name.lower()}_{nombre.name.lower()}")
@property
def serialized(self):
return self._asdict()
@classmethod
def unserialized(cls, kwargs):
return cls(**kwargs)
class Adverbe(t.NamedTuple):
"""Packed as named tuple for consistence"""
adv: str
def __str__(self) -> str:
return self.adv
def accord(self) -> str:
"""for consistence"""
return self.adv
@property
def serialized(self):
return self._asdict()
@classmethod
def unserialized(cls, kwargs):
return cls(**kwargs)
class WordDb:
"""Base de donnée de mots, sérialisable"""
SERIALIZED_GZ_LOCATION = Path(__file__).parent.parent / "morphalou.json.gz"
_serialize_data: dict[str, t.Type[t.NamedTuple]] = {
"noms": Nom,
"adjectifs": Adjectif,
"verbes": Verbe,
"adverbes": Adverbe,
}
CATEGORY_TO_ATTR: dict = {
Nom: "noms",
Adjectif: "adjectifs",
Verbe: "verbes",
Adverbe: "adverbes",
}
noms: list[Nom]
adjectifs: list[Adjectif]
verbes: list[Verbe]
adverbes: list[Adverbe]
def __init__(
self,
noms: t.Optional[list[Nom]] = None,
adjectifs: t.Optional[list[Adjectif]] = None,
verbes: t.Optional[list[Verbe]] = None,
adverbes: t.Optional[list[Adverbe]] = None,
):
self.noms = noms or []
self.adjectifs = adjectifs or []
self.verbes = verbes or []
self.adverbes = adverbes or []
def serialize(self) -> dict:
"""Serialize to plain dictionary (no classes)"""
return {
attr: [x.serialized for x in getattr(self, attr)]
for attr in self.__class__._serialize_data
}
def save(self, fd):
"""Serialize to this stream"""
json.dump(self.serialize(), fd)
@classmethod
@t.no_type_check # serialization is messy
def unserialize(cls, data: dict) -> "WordDb":
"""Reverses :serialize:"""
parsed = {}
for attr, attr_cls in cls._serialize_data.items():
parsed[attr] = list(map(attr_cls.unserialized, data[attr]))
return cls(**parsed)
@classmethod
def load(cls, fd) -> "WordDb":
"""Unserialize from this stream"""
return cls.unserialize(json.load(fd))
@classmethod
def autoload(cls) -> "WordDb":
"""Unserialize from default source"""
with gzip.open(cls.SERIALIZED_GZ_LOCATION) as h:
return cls.load(h)
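A round-trip sketch for this serialization format, in memory (the sample noun is made up):

```python
import io

from pwgen_fr.word_db import Genre, Nom, Nombre, WordDb

db = WordDb(noms=[Nom(genre=Genre.MASC, sing="chat", plur="chats")])
buf = io.StringIO()
db.save(buf)   # plain JSON; enum members are stored by name
buf.seek(0)
restored = WordDb.load(buf)
assert restored.noms[0].accord(Nombre.PLUR) == "chats"
```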

1
requirements.txt Normal file

@@ -0,0 +1 @@
lxml

setup.py

@@ -25,9 +25,7 @@ setup(
install_requires=parse_requirements(),
entry_points={
"console_scripts": [
# (
# "proxmox-snapshot-review = proxmox_scripts.snapshots:review_snapshots",
# ),
("pwgen-fr = pwgen_fr.entrypoints:pwgen_fr",),
]
},
)