Compare commits

..

No commits in common. "accords" and "main" have entirely different histories.

11 changed files with 52 additions and 851 deletions

1
data/raw/.gitignore vendored
View file

@ -1,2 +1 @@
Lexique383 Lexique383
Morphalou3.1_formatTEI

View file

@ -1,29 +0,0 @@
# Versions réduites de jeux de données
Les fichiers dans ce dossier sont des versions réduites de jeux de données
tiers.
Veillez à respecter les licences respectives de ces ressources dans les usages
que vous en faites.
## Lexique
La base de données Lexique (http://www.lexique.org/) est le travail, entre
autres contributeurs et contributrices, de Boris New et Christophe Pallier,
sous licence CC BY-NC.
Le fichier présent ici est une version tronquée de la v3.83. Il ne conserve que
la partie utile au présent logiciel. Le jeu de données entier est disponible
sur leur site.
## Morphalou
La base de données Morphalou
(https://www.ortolang.fr/market/lexicons/morphalou/v3.1) est le travail, entre
autres contributeurs et contributrices, de Sandrine Ollinger, Christophe
Benzitoun, Evelyne Jacquey, Ulrike Fleury, Etienne Petitjean et Marie
Tonnelier. Sa version 3.1 est distribuée sous licence LGPL-LR.
Le fichier présent ici est une version tronquée de la v3.1. Il ne conserve que
la partie utile au présent logiciel. Le jeu de données entier est disponible
sur leur site.

View file

View file

@ -1,22 +0,0 @@
import argparse
import typing as t
from . import generate
def pwgen_fr():
    """Command-line entry point: print one generated passphrase.

    Reads a single positional ``mode`` argument from the command line, runs the
    generator registered for that mode and prints its result on stdout.
    argparse rejects any mode that is not a key of ``choices_map``.
    """
    # Mode name -> zero-argument callable producing the passphrase.
    choices_map: dict[str, t.Callable[[], str]] = {
        "phrase4": generate.gen_phrase4,
        "phrase6": generate.gen_phrase6,
        "rand4": lambda: generate.gen_rand(n=4),
        # "rand6" must ask for six words; it previously passed n=4 and was
        # therefore an exact duplicate of "rand4".
        "rand6": lambda: generate.gen_rand(n=6),
    }

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "mode", choices=list(choices_map), help="Select the generation procedure used"
    )
    args = parser.parse_args()
    print(choices_map[args.mode]())

View file

@ -1,72 +1,37 @@
import secrets import secrets
from . import word_db from . import lexique
wdb = word_db.WordDb.autoload() lex = lexique.Lexique.parse()
def gen_phrase4() -> str: def gen_phrase4():
"""Generates a sentence with four words, of structure Adjective Noun Verb Adverb""" out = []
nombre = word_db.Nombre.pick() out.append(secrets.choice(lex.most_common(lexique.CatGram.ADJECTIF)))
temps = word_db.Temps.pick() out.append(secrets.choice(lex.most_common(lexique.CatGram.NOM)))
out.append(secrets.choice(lex.most_common(lexique.CatGram.VERBE)))
adj = secrets.choice(wdb.adjectifs) out.append(secrets.choice(lex.most_common(lexique.CatGram.NOM)))
nom = secrets.choice(wdb.noms) return " ".join(map(lambda x: x.word, out))
verbe = secrets.choice(wdb.verbes)
adverbe = secrets.choice(wdb.adverbes)
return " ".join(
[
adj.accord(nom.genre_or_pick, nombre),
nom.accord(nombre),
verbe.accord(temps, nombre),
adverbe.accord(),
]
)
def gen_phrase6() -> str: def gen_rand(n=4):
"""Generates a sentence with six words, of structure Adjective Noun Verb Adjective
Noun Adverb"""
nombres = [word_db.Nombre.pick() for _ in range(2)]
temps = word_db.Temps.pick()
adj0 = secrets.choice(wdb.adjectifs)
nom0 = secrets.choice(wdb.noms)
verbe = secrets.choice(wdb.verbes)
adj1 = secrets.choice(wdb.adjectifs)
nom1 = secrets.choice(wdb.noms)
adverbe = secrets.choice(wdb.adverbes)
return " ".join(
[
adj0.accord(nom0.genre_or_pick, nombres[0]),
nom0.accord(nombres[0]),
verbe.accord(temps, nombres[0]),
adj1.accord(nom1.genre_or_pick, nombres[1]),
nom1.accord(nombres[1]),
adverbe.accord(),
]
)
def gen_rand(n=4) -> str:
"""Generates a fully random sequence of n words, without grammatical consistency"""
out = [] out = []
for _ in range(n): for _ in range(n):
word_cat = secrets.choice(list(wdb.CATEGORY_TO_ATTR)) cat = secrets.choice(
if word_cat == word_db.Nom: (
nombre = word_db.Nombre.pick() lexique.CatGram.ADJECTIF,
out.append(secrets.choice(wdb.noms).accord(nombre)) lexique.CatGram.NOM,
elif word_cat == word_db.Adjectif: lexique.CatGram.VERBE,
genre = word_db.Genre.pick() lexique.CatGram.ADVERBE,
nombre = word_db.Nombre.pick() )
out.append(secrets.choice(wdb.adjectifs).accord(genre, nombre)) )
elif word_cat == word_db.Verbe: out.append(secrets.choice(lex.most_common(cat)))
temps = word_db.Temps.pick() return " ".join(map(lambda x: x.word, out))
nombre = word_db.Nombre.pick()
out.append(secrets.choice(wdb.verbes).accord(temps, nombre))
elif word_cat == word_db.Adverbe:
out.append(secrets.choice(wdb.adverbes).accord())
return " ".join(out)
def gen_nom(n: int = 4) -> str:
    """Generate a space-separated sequence of ``n`` randomly chosen nouns.

    Every word is drawn independently with ``secrets.choice`` among the most
    common nouns of the module-level lexicon ``lex``.
    """
    # The candidate list is identical for every draw, and most_common() filters
    # and sorts the whole dataset on each call: compute it once, not n times.
    nouns = lex.most_common(lexique.CatGram.NOM)
    return " ".join(secrets.choice(nouns).word for _ in range(n))

View file

@ -1,13 +1,9 @@
import csv import csv
import itertools
from dataclasses import dataclass, field
import logging import logging
import subprocess import subprocess
import typing as t import typing as t
from bisect import bisect_left
import enum import enum
from pathlib import Path from pathlib import Path
from .word_db import Genre, Nombre, Temps, Nom, Adjectif, Verbe, Adverbe, WordDb
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -31,25 +27,13 @@ class CatGram(enum.Enum):
base = val.split(":", maxsplit=1)[0] base = val.split(":", maxsplit=1)[0]
return cls(base) return cls(base)
def __lt__(self, oth):
return self.value < oth.value
class Word(t.NamedTuple):
def match_enum_or_all(val: str, enum_mapper, enum_cls) -> list: word: str
"""The value of the enum corresponding if any; else, all terms of the enum""" lemme: str # canonical form
if val in enum_mapper:
return [enum_mapper[val]]
return list(enum_cls)
@dataclass
class Mot:
mot: str
lemme: str
cat_gram: CatGram cat_gram: CatGram
freq: float # occurrences of the canonical form by million words freq_lem: float # occurrences of the canonical form, in films, by million words
variantes: dict[tuple, str] = field(default_factory=dict) freq: float # occurrences of this exact form, in films, by million words
genre: t.Optional[Genre] = None
class Lexique: class Lexique:
@ -63,38 +47,10 @@ class Lexique:
CatGram.ADVERBE: 10000, CatGram.ADVERBE: 10000,
} }
class Parsers: dataset: list[Word]
"""Datatables to help parse the original data"""
genre: dict[str, Genre] = { def __init__(self, dataset):
"m": Genre.MASC,
"f": Genre.FEM,
}
rev_genre: dict[t.Optional[Genre], str] = {
None: "",
Genre.MASC: "m",
Genre.FEM: "f",
}
nombre: dict[str, Nombre] = {
"s": Nombre.SING,
"p": Nombre.PLUR,
}
verbe_temps: dict[str, Temps] = {
"ind:pre": Temps.PRESENT,
"ind:fut": Temps.FUTUR,
"ind:imp": Temps.IMPARFAIT,
}
verbe_personne: dict[str, Nombre] = {
"3s": Nombre.SING,
"3p": Nombre.PLUR,
}
dataset: list[Mot]
lemfreq: dict[str, float]
def __init__(self, dataset, lemfreq):
self.dataset = dataset self.dataset = dataset
self.lemfreq = lemfreq
@classmethod @classmethod
def _ensure_uncompressed(cls): def _ensure_uncompressed(cls):
@ -127,117 +83,32 @@ class Lexique:
f"Uncompressed dataset still missing at {cls.LEXIQUE_DIR_PATH} after extraction" f"Uncompressed dataset still missing at {cls.LEXIQUE_DIR_PATH} after extraction"
) )
@classmethod
def _find_word_key(cls, mot: Mot):
    """Return the ``(lemme, cat_gram, genre code)`` tuple identifying ``mot``.

    The genre code is the string that ``Parsers.rev_genre`` associates with the
    word's genre (``""`` when the genre is ``None``).
    """
    genre_code = cls.Parsers.rev_genre[mot.genre]
    return (mot.lemme, mot.cat_gram, genre_code)
@classmethod
def _find_word(cls, dataset: list[Mot], row: dict) -> t.Optional[Mot]:
    """Look up the entry of ``dataset`` that matches a raw ``row``.

    ``dataset`` is searched by bisection on ``_find_word_key``, so it must
    already be sorted with that same key. The genre only takes part in the
    match for nouns. Returns ``None`` when no entry has exactly the row's key.
    """
    lemma_text = row["lemme"]
    category = CatGram.parse(row["cgram"])
    wanted = (
        lemma_text,
        category,
        row["genre"] if category == CatGram.NOM else "",
    )

    index = bisect_left(dataset, wanted, key=cls._find_word_key)
    if index < len(dataset):
        candidate = dataset[index]
        # bisect only yields an insertion point: confirm it is an exact match.
        if cls._find_word_key(candidate) == wanted:
            return candidate
    return None
@classmethod @classmethod
def parse(cls) -> "Lexique": def parse(cls) -> "Lexique":
out = [] out = []
rows = []
lemfreq: dict[str, float] = {}
with cls.LEXIQUE_PATH.open("r") as h: with cls.LEXIQUE_PATH.open("r") as h:
reader = csv.DictReader(h, dialect="excel-tab") reader = csv.DictReader(h, dialect="excel-tab")
for row in reader: for row in reader:
if not row["cgram"]: if not row["cgram"]:
continue continue
rows.append(row) try:
out.append(
# First pass: generate canonical forms (lemmes) Word(
for row in rows: word=row["ortho"],
cat_gram = CatGram.parse(row["cgram"]) lemme=row["lemme"],
if (row["lemme"] != row["ortho"]) and not ( cat_gram=CatGram.parse(row["cgram"]),
cat_gram == CatGram.NOM and row["genre"] == "f" and row["nombre"] == "s" freq_lem=float(row["freqlemlivres"]),
): freq=float(row["freqlivres"]),
# Un nom singulier féminin est considéré comme forme canonique )
continue )
except ValueError as exn:
genre: t.Optional[Genre] = None print(row)
if cat_gram == CatGram.NOM: raise exn from exn
genre = cls.Parsers.genre.get(row["genre"], None) return cls(out)
out.append(
Mot(
mot=row["ortho"],
lemme=row["lemme"],
cat_gram=cat_gram,
freq=float(row["freqlemlivres"]),
genre=genre,
)
)
out.sort(key=cls._find_word_key) # We need to bisect on this.
# Second pass: populate variants
for row in rows:
# Populate lemfreq
old_freq = lemfreq.get(row["ortho"], 0.0)
lemfreq[row["ortho"]] = max(
old_freq,
float(row["freqlemlivres"]),
float(row["freqlemfilms2"]),
)
lemme = cls._find_word(out, row)
if lemme is None:
continue
if lemme.cat_gram == CatGram.NOM:
nombres = match_enum_or_all(row["nombre"], cls.Parsers.nombre, Nombre)
for nombre in nombres:
lemme.variantes[(nombre,)] = row["ortho"]
elif lemme.cat_gram == CatGram.VERBE:
infover = row["infover"].split(";")
for raw_ver in infover:
ver = raw_ver.split(":")
temps = None
personne = None
temps_select = ":".join(ver[0:2])
if temps_select not in Temps:
continue
temps = Temps(temps_select)
personne = cls.Parsers.verbe_personne.get(ver[2], None)
if personne is None:
continue # we're not interested in all conj. persons
lemme.variantes[(temps, personne)] = row["ortho"]
elif lemme.cat_gram == CatGram.ADJECTIF:
genres = match_enum_or_all(row["genre"], cls.Parsers.genre, Genre)
nombres = match_enum_or_all(row["nombre"], cls.Parsers.nombre, Nombre)
for genre, nombre in itertools.product(genres, nombres):
lemme.variantes[(genre, nombre)] = row["ortho"]
# No need to match adverbs (invariant)
return cls(out, lemfreq)
def most_common( def most_common(
self, cat_gram: CatGram, threshold: t.Optional[int] = None self, cat_gram: CatGram, threshold: t.Optional[int] = None
) -> list[Mot]: ) -> list[Word]:
if threshold is None: if threshold is None:
try: try:
threshold = self.PRESET_THRESHOLD_BY_CAT[cat_gram] threshold = self.PRESET_THRESHOLD_BY_CAT[cat_gram]
@ -249,52 +120,3 @@ class Lexique:
out = list(filter(lambda word: word.cat_gram == cat_gram, self.dataset)) out = list(filter(lambda word: word.cat_gram == cat_gram, self.dataset))
out.sort(key=lambda word: word.freq, reverse=True) out.sort(key=lambda word: word.freq, reverse=True)
return out[:threshold] return out[:threshold]
def word_db(self, thresholds: t.Optional[dict[CatGram, int]] = None) -> WordDb:
    """Build a ``WordDb`` from the most common words of each category.

    ``thresholds`` optionally maps a category to the threshold handed to
    ``most_common``; a category without an entry is passed ``None``.
    A selected word lacking one of the required variants raises ``KeyError``.
    """
    limits = thresholds or {}

    def pick(cat: CatGram) -> list:
        # Most common words of one category, honouring the caller's threshold.
        return self.most_common(cat, limits.get(cat, None))

    db_noms = []
    for nom in pick(CatGram.NOM):
        forms = nom.variantes
        db_noms.append(
            Nom(
                genre=t.cast(Genre, nom.genre),  # not None for nouns
                sing=forms[(Nombre.SING,)],
                plur=forms[(Nombre.PLUR,)],
            )
        )

    db_adjectifs = []
    for adj in pick(CatGram.ADJECTIF):
        forms = adj.variantes
        db_adjectifs.append(
            Adjectif(
                masc_sing=forms[(Genre.MASC, Nombre.SING)],
                masc_plur=forms[(Genre.MASC, Nombre.PLUR)],
                fem_sing=forms[(Genre.FEM, Nombre.SING)],
                fem_plur=forms[(Genre.FEM, Nombre.PLUR)],
            )
        )

    db_verbes = []
    for verbe in pick(CatGram.VERBE):
        forms = verbe.variantes
        db_verbes.append(
            Verbe(
                present_sing=forms[(Temps.PRESENT, Nombre.SING)],
                present_plur=forms[(Temps.PRESENT, Nombre.PLUR)],
                futur_sing=forms[(Temps.FUTUR, Nombre.SING)],
                futur_plur=forms[(Temps.FUTUR, Nombre.PLUR)],
                imparfait_sing=forms[(Temps.IMPARFAIT, Nombre.SING)],
                imparfait_plur=forms[(Temps.IMPARFAIT, Nombre.PLUR)],
            )
        )

    db_adverbes = []
    for adv in pick(CatGram.ADVERBE):
        db_adverbes.append(Adverbe(adv=adv.mot))

    return WordDb(
        noms=db_noms, adjectifs=db_adjectifs, verbes=db_verbes, adverbes=db_adverbes
    )

View file

@ -1,256 +0,0 @@
""" Reads the Morphalou dataset, in its TSV form """
import itertools
import logging
import subprocess
import typing as t
from pathlib import Path
from lxml import etree
from .word_db import Adjectif, Adverbe, Genre, Nom, Nombre, Temps, Verbe, WordDb
TSV_NS = {
"tsv": "http://www.tei-c.org/ns/1.0",
"xml": "http://www.w3.org/XML/1998/namespace",
}
logger = logging.getLogger(__name__)
class MorphalouSet:
    """Reader for the Morphalou 3.1 lexicon, populating a ``WordDb``.

    The dataset is expected as one TEI XML file per grammatical category under
    ``MORPHALOU_DIR_PATH``; when that directory is missing, it is extracted
    from a ``.tar.xz`` archive located in the same parent directory.
    """

    MORPHALOU_DIR_PATH = (
        Path(__file__).parent.parent / "data/raw/Morphalou3.1_formatTEI"
    )
    MORPHALOU_FILENAME_TEMPLATE = "{cat_name}_Morphalou3.1_TEI.xml"

    # Word category -> category name used in the dataset's file names.
    CAT_MAPPING: dict[t.Type[t.NamedTuple], str] = {
        Nom: "commonNoun",
        Adjectif: "adjective",
        Verbe: "verb",
        Adverbe: "adverb",
    }

    word_db: WordDb

    def __init__(self):
        self.word_db = WordDb()

    @classmethod
    def _ensure_uncompressed(cls):
        """Ensures the dataset is uncompressed

        Raises:
            Exception: if the archive is missing, or if the dataset directory
                is still absent once extraction has completed.
            subprocess.CalledProcessError: if ``tar`` exits with an error.
        """
        if cls.MORPHALOU_DIR_PATH.exists():
            return

        # NOTE(review): with_suffix() replaces everything after the LAST dot of
        # the name, so for "Morphalou3.1_formatTEI" this resolves to
        # "Morphalou3.tar.xz" -- confirm this is the intended archive name.
        lexique_archive = cls.MORPHALOU_DIR_PATH.with_suffix(".tar.xz")
        if not lexique_archive.exists():
            logger.error("Missing compressed dataset at %s", lexique_archive)
            raise Exception(f"Missing compressed dataset at {lexique_archive}")

        logger.info("Uncompressing dataset")
        subprocess.check_call(
            [
                "tar",
                "-xJf",
                lexique_archive.as_posix(),
                "-C",
                lexique_archive.parent.as_posix(),
            ]
        )

        if not cls.MORPHALOU_DIR_PATH.exists():
            logger.error(
                "Uncompressed dataset still missing at %s after extraction",
                cls.MORPHALOU_DIR_PATH,
            )
            raise Exception(
                f"Uncompressed dataset still missing at {cls.MORPHALOU_DIR_PATH} after extraction"
            )

    def parse(self):
        """Parses the dataset

        For every category of ``CAT_MAPPING``, the matching ``_parse_<attr>``
        method is run on the category's file and its result is stored in the
        corresponding attribute of ``self.word_db``.
        """
        self.__class__._ensure_uncompressed()

        for cat, cat_file in self.__class__.CAT_MAPPING.items():
            word_db_elt = WordDb.CATEGORY_TO_ATTR[cat]
            # Use the module logger like the rest of this class, rather than
            # the root logger.
            logger.info("Parsing %s...", word_db_elt)
            setattr(
                self.word_db,
                word_db_elt,
                getattr(self, f"_parse_{word_db_elt}")(
                    self.__class__.MORPHALOU_DIR_PATH
                    / self.__class__.MORPHALOU_FILENAME_TEMPLATE.format(
                        cat_name=cat_file
                    )
                ),
            )

    def _tsv_elems(self, tsv_path: Path):
        """Opens a TEI XML file, and returns the <body> node, direct parent of
        all the relevant nodes"""
        with tsv_path.open("r") as h:
            tree = etree.parse(h)
        root = tree.getroot()
        body = root.find("./tsv:text/tsv:body", TSV_NS)
        return body

    def _parse_noms(self, tsv_path: Path) -> list[Nom]:
        """Parse the nouns

        Entries without a gender on their lemma, or lacking a singular or a
        plural form, are skipped.
        """
        root = self._tsv_elems(tsv_path)
        out: list[Nom] = []

        for entry in root.iterfind("./tsv:entry", TSV_NS):
            try:
                genre = self._genre(
                    entry.find(
                        "./tsv:form[@type='lemma']/tsv:gramGrp/tsv:gen", TSV_NS
                    ).text
                )
            except AttributeError:
                continue  # some nouns don't have a gender defined, somehow -- ignore

            forms = {}
            for inflected in entry.iterfind("./tsv:form[@type='inflected']", TSV_NS):
                orth = inflected.find("./tsv:orth", TSV_NS).text
                nombres = self._nombre_set(
                    inflected.find("./tsv:gramGrp/tsv:number", TSV_NS).text
                )
                # An invariable form fills both the singular and plural slots.
                for form in nombres:
                    forms[form] = orth

            try:
                out.append(
                    Nom(
                        genre=genre,
                        sing=forms[Nombre.SING],
                        plur=forms[Nombre.PLUR],
                    )
                )
            except KeyError:
                continue  # cannot be inflected to all required forms: skip

        return out

    def _parse_adjectifs(self, tsv_path: Path) -> list[Adjectif]:
        """Parse the adjectives

        Entries lacking one of the four gender/number forms are skipped.
        """
        root = self._tsv_elems(tsv_path)
        out: list[Adjectif] = []

        for entry in root.iterfind("./tsv:entry", TSV_NS):
            forms = {}
            for inflected in entry.iterfind("./tsv:form[@type='inflected']", TSV_NS):
                orth = inflected.find("./tsv:orth", TSV_NS).text
                gram_grp = inflected.find("./tsv:gramGrp", TSV_NS)
                genres = self._genre_set(gram_grp.find("./tsv:gen", TSV_NS).text)
                nombres = self._nombre_set(gram_grp.find("./tsv:number", TSV_NS).text)
                # An invariable gender and/or number covers several slots.
                for form in itertools.product(genres, nombres):
                    forms[form] = orth

            try:
                out.append(
                    Adjectif(
                        masc_sing=forms[Genre.MASC, Nombre.SING],
                        masc_plur=forms[Genre.MASC, Nombre.PLUR],
                        fem_sing=forms[Genre.FEM, Nombre.SING],
                        fem_plur=forms[Genre.FEM, Nombre.PLUR],
                    )
                )
            except KeyError:
                continue  # cannot be inflected to all required forms: skip

        return out

    def _parse_verbes(self, tsv_path: Path) -> list[Verbe]:
        """Parse the verbs

        Only indicative, third-person forms in a tense known to ``_tense`` are
        kept; entries lacking one of the six required forms are skipped.
        """
        root = self._tsv_elems(tsv_path)
        out: list[Verbe] = []

        for entry in root.iterfind("./tsv:entry", TSV_NS):
            forms = {}
            for inflected in entry.iterfind("./tsv:form[@type='inflected']", TSV_NS):
                gram_grp = inflected.find("./tsv:gramGrp", TSV_NS)

                # Order of tests is important! If mood == 'participle', there is no
                # 'person' defined.
                if (
                    gram_grp.find("./tsv:mood", TSV_NS).text != "indicative"
                    or gram_grp.find("./tsv:per", TSV_NS).text != "thirdPerson"
                ):
                    continue  # irrelevant for us

                temps = self._tense(gram_grp.find("./tsv:tns", TSV_NS).text)
                if temps is None:
                    continue  # irrelevant for us

                nombres = self._nombre_set(gram_grp.find("./tsv:number", TSV_NS).text)
                orth = inflected.find("./tsv:orth", TSV_NS).text
                for nombre in nombres:
                    forms[(temps, nombre)] = orth

            try:
                out.append(
                    Verbe(
                        present_sing=forms[Temps.PRESENT, Nombre.SING],
                        present_plur=forms[Temps.PRESENT, Nombre.PLUR],
                        futur_sing=forms[Temps.FUTUR, Nombre.SING],
                        futur_plur=forms[Temps.FUTUR, Nombre.PLUR],
                        imparfait_sing=forms[Temps.IMPARFAIT, Nombre.SING],
                        imparfait_plur=forms[Temps.IMPARFAIT, Nombre.PLUR],
                    )
                )
            except KeyError:
                continue  # cannot be inflected to all required forms: skip

        return out

    def _parse_adverbes(self, tsv_path: Path) -> list[Adverbe]:
        """Parse the adverbs"""
        root = self._tsv_elems(tsv_path)
        out: list[Adverbe] = []

        for entry in root.iterfind("./tsv:entry", TSV_NS):
            # We're only interested in the lemma form
            orth = entry.find("./tsv:form[@type='lemma']/tsv:orth", TSV_NS)
            assert orth is not None
            adv = orth.text
            out.append(Adverbe(adv=adv))

        return out

    @staticmethod
    def _genre_set(genre: str) -> list[Genre]:
        """Genders covered by a dataset gender label ("invariable" covers both).

        Raises ``KeyError`` on an unknown label.
        """
        return {
            "masculine": [Genre.MASC],
            "feminine": [Genre.FEM],
            "invariable": [Genre.MASC, Genre.FEM],
        }[genre]

    @staticmethod
    def _genre(genre: str) -> Genre:
        """Single ``Genre`` for a dataset gender label.

        Raises ``KeyError`` on an unknown label.
        """
        return {
            "masculine": Genre.MASC,
            "feminine": Genre.FEM,
            "invariable": Genre.INV,
        }[genre]

    @staticmethod
    def _nombre(nombre: str) -> Nombre:
        """Single ``Nombre`` for a dataset number label.

        Raises ``KeyError`` on an unknown label (including "invariable").
        """
        return {
            "singular": Nombre.SING,
            "plural": Nombre.PLUR,
        }[nombre]

    @staticmethod
    def _nombre_set(nombre: str) -> list[Nombre]:
        """Numbers covered by a dataset number label ("invariable" covers both).

        Raises ``KeyError`` on an unknown label.
        """
        return {
            "singular": [Nombre.SING],
            "plural": [Nombre.PLUR],
            "invariable": [Nombre.SING, Nombre.PLUR],
        }[nombre]

    @staticmethod
    def _tense(tense: str) -> t.Optional[Temps]:
        """``Temps`` for a dataset tense label, or ``None`` for any other tense."""
        return {
            "present": Temps.PRESENT,
            "imperfect": Temps.IMPARFAIT,
            "future": Temps.FUTUR,
        }.get(tense, None)

View file

@ -1,74 +0,0 @@
""" Generates a worddb based on Morphalou, but limits to frequent words based on
external sources (eg Lexique) """
import logging
import typing as t
from . import lexique, morphalou
from .word_db import Adjectif, Adverbe, Genre, Nom, Nombre, Temps, Verbe, WordDb
logger = logging.getLogger(__name__)
class MorphalouFreqSet:
    """Morphalou word database restricted to words that Lexique knows as used.

    ``filtered_db`` holds the entries of ``morphalou_db`` for which at least
    one of the inspected forms has a strictly positive frequency in
    ``lexique.lemfreq``.
    """

    morphalou_db: WordDb
    lexique: lexique.Lexique
    filtered_db: WordDb

    def __init__(
        self,
        morphalou_db: t.Optional[WordDb] = None,
        lexique: t.Optional[lexique.Lexique] = None,
    ):
        """Load (or reuse) both sources, then compute ``filtered_db``.

        Args:
            morphalou_db: already-parsed Morphalou database; parsed from the
                dataset when omitted.
            lexique: already-parsed Lexique; parsed from the dataset when
                omitted.
        """
        if morphalou_db is None:
            morphalou_set = morphalou.MorphalouSet()
            morphalou_set.parse()
            morphalou_db = morphalou_set.word_db
        self.morphalou_db = morphalou_db

        if lexique is None:
            # The `lexique` parameter shadows the `lexique` module inside this
            # function, so `lexique.Lexique` would be an attribute lookup on
            # None. Re-import the module under a distinct local name.
            from . import lexique as lexique_module

            lexique = lexique_module.Lexique.parse()
        self.lexique = lexique

        self.filtered_db = self._filter_lexique()

    def _filter_nom(self, nom: Nom) -> bool:
        """Whether the singular or the plural form has a positive frequency."""
        freq = max(
            self.lexique.lemfreq.get(nom.sing, 0.0),
            self.lexique.lemfreq.get(nom.plur, 0.0),
        )
        return freq > 0

    def _filter_adjectif(self, adjectif: Adjectif) -> bool:
        """Whether either singular form (masc. or fem.) has a positive frequency."""
        freq = max(
            self.lexique.lemfreq.get(adjectif.masc_sing, 0.0),
            self.lexique.lemfreq.get(adjectif.fem_sing, 0.0),
        )
        return freq > 0

    def _filter_verbe(self, verbe: Verbe) -> bool:
        """Whether any of the six stored forms has a positive frequency."""
        freq = max(
            self.lexique.lemfreq.get(verbe.present_sing, 0.0),
            self.lexique.lemfreq.get(verbe.futur_sing, 0.0),
            self.lexique.lemfreq.get(verbe.imparfait_sing, 0.0),
            self.lexique.lemfreq.get(verbe.present_plur, 0.0),
            self.lexique.lemfreq.get(verbe.futur_plur, 0.0),
            self.lexique.lemfreq.get(verbe.imparfait_plur, 0.0),
        )
        return freq > 0

    def _filter_adverbe(self, adverbe: Adverbe) -> bool:
        """Whether the adverb is a single word with a positive frequency."""
        if " " in adverbe.adv:
            return False  # multi-word adverbial phrases are rejected
        freq = self.lexique.lemfreq.get(adverbe.adv, 0.0)
        return freq > 0

    def _filter_lexique(self) -> WordDb:
        """Return a new ``WordDb`` keeping only the entries accepted above."""
        out = WordDb()
        out.noms = list(filter(self._filter_nom, self.morphalou_db.noms))
        out.adjectifs = list(filter(self._filter_adjectif, self.morphalou_db.adjectifs))
        out.verbes = list(filter(self._filter_verbe, self.morphalou_db.verbes))
        out.adverbes = list(filter(self._filter_adverbe, self.morphalou_db.adverbes))
        return out

View file

@ -1,206 +0,0 @@
""" A pre-processed database of words, independant of their source """
import gzip
import json
import secrets
import typing as t
from enum import Enum
from pathlib import Path
class Genre(Enum):
    """Grammatical gender."""

    MASC = "masculin"
    FEM = "féminin"
    INV = "invariable"  # for nouns only

    @classmethod
    def pick(cls) -> "Genre":
        """Draw masculine or feminine at random; never returns ``INV``."""
        candidates = (cls.MASC, cls.FEM)
        return secrets.choice(candidates)
class Nombre(Enum):
    """Grammatical number."""

    SING = "singulier"
    PLUR = "pluriel"

    @classmethod
    def pick(cls) -> "Nombre":
        """Draw one of the members at random."""
        members = tuple(cls)
        return secrets.choice(members)
class Temps(Enum):
    """Verb tense."""

    PRESENT = "present"
    FUTUR = "futur"
    IMPARFAIT = "imparfait"

    @classmethod
    def pick(cls) -> "Temps":
        """Draw one of the members at random."""
        members = tuple(cls)
        return secrets.choice(members)
class Nom(t.NamedTuple):
    """Common noun"""

    genre: Genre
    sing: str
    plur: str

    def __str__(self) -> str:
        return f"{self.sing}"

    def accord(self, nombre: Nombre) -> str:
        """Inflect for number: return the field named after ``nombre``
        (``sing`` or ``plur``)."""
        return getattr(self, nombre.name.lower())

    @property
    def genre_or_pick(self) -> Genre:
        """Genre of the noun, or random-pick if invariable"""
        if self.genre == Genre.INV:
            return Genre.pick()
        return self.genre

    @property
    def serialized(self):
        """Plain-dict form; the genre is stored by its enum member name."""
        return {"genre": self.genre.name, "sing": self.sing, "plur": self.plur}

    @classmethod
    def unserialized(cls, kwargs):
        """Reverse ``serialized``.

        Raises ``KeyError`` when ``"genre"`` is absent or names no ``Genre``
        member.
        """
        # Work on a copy: popping "genre" from the caller's dict would corrupt
        # the input data and make a second unserialization of it fail.
        fields = dict(kwargs)
        fields["genre"] = Genre[fields["genre"]]
        return cls(**fields)
class Adjectif(t.NamedTuple):
    """Adjective, stored as its four gender/number forms."""

    masc_sing: str
    masc_plur: str
    fem_sing: str
    fem_plur: str

    def __str__(self) -> str:
        return f"{self.masc_sing}/{self.fem_sing}"

    def accord(self, genre: Genre, nombre: Nombre) -> str:
        """Inflect for gender and number: return the ``<genre>_<nombre>`` field."""
        field_name = f"{genre.name.lower()}_{nombre.name.lower()}"
        return getattr(self, field_name)

    @property
    def serialized(self):
        """Plain-dict form of the tuple."""
        return self._asdict()

    @classmethod
    def unserialized(cls, kwargs):
        """Reverse ``serialized``."""
        return cls(**kwargs)
class Verbe(t.NamedTuple):
    """Verb, stored as singular/plural forms in three tenses."""

    present_sing: str
    present_plur: str
    futur_sing: str
    futur_plur: str
    imparfait_sing: str
    imparfait_plur: str

    def __str__(self) -> str:
        return f"{self.present_sing}"

    def accord(self, temps: Temps, nombre: Nombre) -> str:
        """Inflect for tense and number (only the 3rd person is used)"""
        field_name = f"{temps.name.lower()}_{nombre.name.lower()}"
        return getattr(self, field_name)

    @property
    def serialized(self):
        """Plain-dict form of the tuple."""
        return self._asdict()

    @classmethod
    def unserialized(cls, kwargs):
        """Reverse ``serialized``."""
        return cls(**kwargs)
class Adverbe(t.NamedTuple):
    """Adverb, wrapped in a named tuple for consistency with the other
    word categories"""

    adv: str

    def __str__(self) -> str:
        return self.adv

    def accord(self) -> str:
        """Return the adverb unchanged (present for consistency)"""
        return self.adv

    @property
    def serialized(self):
        """Plain-dict form of the tuple."""
        return self._asdict()

    @classmethod
    def unserialized(cls, kwargs):
        """Reverse ``serialized``."""
        return cls(**kwargs)
class WordDb:
    """Serializable word database"""

    SERIALIZED_GZ_LOCATION = Path(__file__).parent.parent / "morphalou.json.gz"

    # Attribute name -> class of the items stored in that attribute.
    _serialize_data: dict[str, t.Type[t.NamedTuple]] = {
        "noms": Nom,
        "adjectifs": Adjectif,
        "verbes": Verbe,
        "adverbes": Adverbe,
    }

    # Item class -> name of the attribute holding items of that class.
    CATEGORY_TO_ATTR: dict = {
        Nom: "noms",
        Adjectif: "adjectifs",
        Verbe: "verbes",
        Adverbe: "adverbes",
    }

    noms: list[Nom]
    adjectifs: list[Adjectif]
    verbes: list[Verbe]
    adverbes: list[Adverbe]

    def __init__(
        self,
        noms: t.Optional[list[Nom]] = None,
        adjectifs: t.Optional[list[Adjectif]] = None,
        verbes: t.Optional[list[Verbe]] = None,
        adverbes: t.Optional[list[Adverbe]] = None,
    ):
        """Store the four word lists; a missing or empty argument becomes a
        fresh empty list."""
        self.noms = noms or []
        self.adjectifs = adjectifs or []
        self.verbes = verbes or []
        self.adverbes = adverbes or []

    def serialize(self) -> dict:
        """Serialize to plain dictionary (no classes)"""
        plain = {}
        for attr in self.__class__._serialize_data:
            plain[attr] = [item.serialized for item in getattr(self, attr)]
        return plain

    def save(self, fd):
        """Serialize to this stream"""
        json.dump(self.serialize(), fd)

    @classmethod
    @t.no_type_check  # serialization is messy
    def unserialize(cls, data: dict) -> "WordDb":
        """Rebuild a database from the dictionary produced by ``serialize``."""
        parsed = {
            attr: [attr_cls.unserialized(raw) for raw in data[attr]]
            for attr, attr_cls in cls._serialize_data.items()
        }
        return cls(**parsed)

    @classmethod
    def load(cls, fd) -> "WordDb":
        """Unserialize from this stream"""
        raw = json.load(fd)
        return cls.unserialize(raw)

    @classmethod
    def autoload(cls) -> "WordDb":
        """Unserialize from default source"""
        with gzip.open(cls.SERIALIZED_GZ_LOCATION) as h:
            return cls.load(h)

View file

@ -25,7 +25,9 @@ setup(
install_requires=parse_requirements(), install_requires=parse_requirements(),
entry_points={ entry_points={
"console_scripts": [ "console_scripts": [
("pwgen-fr = pwgen_fr.entrypoints:pwgen_fr",), # (
# "proxmox-snapshot-review = proxmox_scripts.snapshots:review_snapshots",
# ),
] ]
}, },
) )