pwgen-fr/pwgen_fr/word_db.py

206 lines
4.9 KiB
Python

""" A pre-processed database of words, independant of their source """
import gzip
import json
import secrets
import typing as t
from enum import Enum
from pathlib import Path
class Genre(Enum):
MASC = "masculin"
FEM = "féminin"
INV = "invariable" # pour les noms uniquement
@classmethod
def pick(cls) -> "Genre":
"""random-pick (avoids inv)"""
return secrets.choice([cls.masc, cls.fem])
class Nombre(Enum):
SING = "singulier"
PLUR = "pluriel"
@classmethod
def pick(cls) -> "Nombre":
"""random-pick"""
return secrets.choice(list(cls))
class Temps(Enum):
PRESENT = "present"
FUTUR = "futur"
IMPARFAIT = "imparfait"
@classmethod
def pick(cls) -> "Temps":
"""random-pick"""
return secrets.choice(list(cls))
class Nom(t.NamedTuple):
"""Nom commun"""
genre: Genre
sing: str
plur: str
def __str__(self) -> str:
return f"{self.sing}"
def accord(self, nombre: Nombre) -> str:
"""Accorde en nombre"""
return getattr(self, nombre.name.lower())
@property
def genre_or_pick(self) -> Genre:
"""Genre of the noun, or random-pick if invariable"""
if self.genre == Genre.INV:
return Genre.pick()
return self.genre
@property
def serialized(self):
return {"genre": self.genre.name, "sing": self.sing, "plur": self.plur}
@classmethod
def unserialized(cls, kwargs):
genre = Genre[kwargs.pop("genre")]
return cls(**kwargs, genre=genre)
class Adjectif(t.NamedTuple):
masc_sing: str
masc_plur: str
fem_sing: str
fem_plur: str
def __str__(self) -> str:
return f"{self.masc_sing}/{self.fem_sing}"
def accord(self, genre: Genre, nombre: Nombre) -> str:
"""Accorde en genre et en nombre"""
return getattr(self, f"{genre.name.lower()}_{nombre.name.lower()}")
@property
def serialized(self):
return self._asdict()
@classmethod
def unserialized(cls, kwargs):
return cls(**kwargs)
class Verbe(t.NamedTuple):
present_sing: str
present_plur: str
futur_sing: str
futur_plur: str
imparfait_sing: str
imparfait_plur: str
def __str__(self) -> str:
return f"{self.present_sing}"
def accord(self, temps: Temps, nombre: Nombre) -> str:
"""Accorde en temps et en nombre (seule la 3è pers. est utilisée)"""
return getattr(self, f"{temps.name.lower()}_{nombre.name.lower()}")
@property
def serialized(self):
return self._asdict()
@classmethod
def unserialized(cls, kwargs):
return cls(**kwargs)
class Adverbe(t.NamedTuple):
"""Packed as named tuple for consistence"""
adv: str
def __str__(self) -> str:
return self.adv
def accord(self) -> str:
"""for consistence"""
return self.adv
@property
def serialized(self):
return self._asdict()
@classmethod
def unserialized(cls, kwargs):
return cls(**kwargs)
class WordDb:
"""Base de donnée de mots, sérialisable"""
SERIALIZED_GZ_LOCATION = Path(__file__).parent.parent / "morphalou_full.json.gz"
_serialize_data: dict[str, t.Type[t.NamedTuple]] = {
"noms": Nom,
"adjectifs": Adjectif,
"verbes": Verbe,
"adverbes": Adverbe,
}
CATEGORY_TO_ATTR: dict = {
Nom: "noms",
Adjectif: "adjectifs",
Verbe: "verbes",
Adverbe: "adverbes",
}
noms: list[Nom]
adjectifs: list[Adjectif]
verbes: list[Verbe]
adverbes: list[Adverbe]
def __init__(
self,
noms: t.Optional[list[Nom]] = None,
adjectifs: t.Optional[list[Adjectif]] = None,
verbes: t.Optional[list[Verbe]] = None,
adverbes: t.Optional[list[Adverbe]] = None,
):
self.noms = noms or []
self.adjectifs = adjectifs or []
self.verbes = verbes or []
self.adverbes = adverbes or []
def serialize(self) -> dict:
"""Serialize to plain dictionary (no classes)"""
return {
attr: [x.serialized for x in getattr(self, attr)]
for attr in self.__class__._serialize_data
}
def save(self, fd):
"""Serialize to this stream"""
json.dump(self.serialize(), fd)
@classmethod
@t.no_type_check # serialization is messy
def unserialize(cls, data: dict) -> "WordDb":
"""Reverses :serialize:"""
parsed = {}
for attr, attr_cls in cls._serialize_data.items():
parsed[attr] = list(map(attr_cls.unserialized, data[attr]))
return cls(**parsed)
@classmethod
def load(cls, fd) -> "WordDb":
"""Unserialize from this stream"""
return cls.unserialize(json.load(fd))
@classmethod
def autoload(cls) -> "WordDb":
"""Unserialize from default source"""
with gzip.open(cls.SERIALIZED_GZ_LOCATION) as h:
return cls.load(h)