Compare commits

...

3 commits

Author SHA1 Message Date
f3df51ae26 Lexique: work towards word_db
Still problems with other forms
2024-09-10 00:30:29 +02:00
cf455556be Add __init__ for mypy 2024-09-10 00:30:16 +02:00
ff4a71d1a7 Add word_db as a target 2024-09-10 00:30:03 +02:00
3 changed files with 275 additions and 97 deletions

0
pwgen_fr/__init__.py Normal file
View file

View file

@ -1,12 +1,13 @@
import csv import csv
import itertools import itertools
from dataclasses import dataclass from dataclasses import dataclass, field
import logging import logging
import subprocess import subprocess
import typing as t import typing as t
from bisect import bisect_left from bisect import bisect_left
import enum import enum
from pathlib import Path from pathlib import Path
from .word_db import Genre, Nombre, Temps, Nom, Adjectif, Verbe, Adverbe, WordDb
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -34,84 +35,21 @@ class CatGram(enum.Enum):
return self.value < oth.value return self.value < oth.value
def match_enum_or_all(val, enum_cls) -> list: def match_enum_or_all(val: str, enum_mapper, enum_cls) -> list:
"""The value of the enum corresponding if any; else, all terms of the enum""" """The value of the enum corresponding if any; else, all terms of the enum"""
if val in enum_cls: if val in enum_mapper:
return [enum_cls(val)] return [enum_mapper[val]]
return list(enum_cls) return list(enum_cls)
class Genre(enum.Enum):
MASC = "m"
FEM = "f"
class Nombre(enum.Enum):
SING = "s"
PLUR = "p"
class Temps(enum.Enum):
INFINITIF = "inf"
PRESENT = "ind:pre"
FUTUR = "ind:fut"
IMPARFAIT = "ind:imp"
class Personne(enum.Enum):
S1 = "1s"
S2 = "2s"
S3 = "3s"
P1 = "1p"
P2 = "2p"
P3 = "3p"
@dataclass @dataclass
class _Mot: class Mot:
"""Canonical form of a word"""
mot: str mot: str
lemme: str
cat_gram: CatGram cat_gram: CatGram
freq: float # occurrences of the canonical form by million words freq: float # occurrences of the canonical form by million words
variantes: dict[tuple, str] = field(default_factory=dict)
genre: t.Optional[Genre] = None
class Mot(_Mot):
class Variant:
pass
_for_cat_gram: dict[CatGram, t.Type["Mot"]] = {}
_variants: dict
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._variants = {}
def accord(self, variant: Variant) -> str:
return self._variants[variant]
@classmethod
def for_cat_gram(cls, cat_gram: CatGram) -> t.Type["Mot"]:
"""The class to use for a word of given CatGram"""
return cls._for_cat_gram.get(cat_gram, cls)
class Nom(Mot):
class Variant(t.NamedTuple):
genre: Genre
nombre: Nombre
class Verbe(Mot):
class Variant(t.NamedTuple):
temps: Temps
personne: t.Optional[Personne]
Mot._for_cat_gram = {
CatGram.NOM: Nom,
CatGram.VERBE: Verbe,
}
class Lexique: class Lexique:
@ -125,6 +63,32 @@ class Lexique:
CatGram.ADVERBE: 10000, CatGram.ADVERBE: 10000,
} }
class Parsers:
    """Lookup tables translating raw dataset codes into word_db enums."""

    # Gender code found in the dataset -> Genre.
    genre: dict[str, Genre] = {"m": Genre.MASC, "f": Genre.FEM}

    # Reverse of `genre`; a word without gender (None) maps to the empty code.
    rev_genre: dict[t.Optional[Genre], str] = {
        None: "",
        **{gender: code for code, gender in genre.items()},
    }

    # Number code -> Nombre.
    nombre: dict[str, Nombre] = {"s": Nombre.SING, "p": Nombre.PLUR}

    # Tense code -> Temps.
    verbe_temps: dict[str, Temps] = {
        "ind:pre": Temps.PRESENT,
        "ind:fut": Temps.FUTUR,
        "ind:imp": Temps.IMPARFAIT,
    }

    # Person code -> Nombre; only the third-person codes are listed.
    verbe_personne: dict[str, Nombre] = {"3s": Nombre.SING, "3p": Nombre.PLUR}
dataset: list[Mot] dataset: list[Mot]
def __init__(self, dataset): def __init__(self, dataset):
@ -161,6 +125,32 @@ class Lexique:
f"Uncompressed dataset still missing at {cls.LEXIQUE_DIR_PATH} after extraction" f"Uncompressed dataset still missing at {cls.LEXIQUE_DIR_PATH} after extraction"
) )
@classmethod
def _find_word_key(cls, mot: Mot):
    """Return the ``(lemme, cat_gram, gender code)`` tuple identifying ``mot``.

    The gender is translated back to its dataset code through
    ``Parsers.rev_genre`` (``None`` becomes ``""``). This tuple is the key the
    dataset is sorted and bisected on.
    """
    gender_code = cls.Parsers.rev_genre[mot.genre]
    return (mot.lemme, mot.cat_gram, gender_code)
@classmethod
def _find_word(cls, dataset: list[Mot], row: dict) -> t.Optional[Mot]:
    """Look up the canonical word matching a dataset row.

    ``dataset`` must be sorted by ``_find_word_key``, since the lookup is a
    bisection on that key. The key built from ``row`` uses its ``lemme`` and
    ``cgram`` columns, plus the ``genre`` column for nouns only (empty string
    for every other category).

    Returns the matching ``Mot``, or ``None`` when no entry has exactly this key.
    """
    lemma_text = row["lemme"]
    category = CatGram.parse(row["cgram"])
    gender_code = row["genre"] if category == CatGram.NOM else ""
    wanted = (lemma_text, category, gender_code)

    index = bisect_left(dataset, wanted, key=cls._find_word_key)
    if index < len(dataset):
        candidate = dataset[index]
        # bisect only yields an insertion point: confirm the key really matches.
        if wanted == cls._find_word_key(candidate):
            return candidate
    return None
@classmethod @classmethod
def parse(cls) -> "Lexique": def parse(cls) -> "Lexique":
out = [] out = []
@ -174,34 +164,38 @@ class Lexique:
# First pass: generate canonical forms (lemmes) # First pass: generate canonical forms (lemmes)
for row in rows: for row in rows:
if row["lemme"] != row["ortho"]:
continue
cat_gram = CatGram.parse(row["cgram"]) cat_gram = CatGram.parse(row["cgram"])
if (row["lemme"] != row["ortho"]) and not (
cat_gram == CatGram.NOM and row["genre"] == "f" and row["nombre"] == "s"
):
# Un nom singulier féminin est considéré comme forme canonique
continue
genre: t.Optional[Genre] = None
if cat_gram == CatGram.NOM:
genre = cls.Parsers.genre.get(row["genre"], None)
out.append( out.append(
Mot.for_cat_gram(cat_gram)( Mot(
mot=row["ortho"], mot=row["ortho"],
lemme=row["lemme"],
cat_gram=cat_gram, cat_gram=cat_gram,
freq=float(row["freqlemlivres"]), freq=float(row["freqlemlivres"]),
genre=genre,
) )
) )
out.sort(key=lambda x: (x.mot, x.cat_gram)) # We need to bisect on this. out.sort(key=cls._find_word_key) # We need to bisect on this.
# Second pass: populate variants # Second pass: populate variants
for row in rows: for row in rows:
str_lemme = row["lemme"] lemme = cls._find_word(out, row)
cat_gram = CatGram.parse(row['cgram']) if lemme is None:
lemme_pos = bisect_left(out, (str_lemme, cat_gram), key=lambda x: (x.mot, x.cat_gram)) continue
if lemme_pos > len(out) or out[lemme_pos].mot != str_lemme:
continue # Unknown word
lemme = out[lemme_pos]
if lemme.cat_gram == CatGram.NOM: if lemme.cat_gram == CatGram.NOM:
genres = match_enum_or_all(row["genre"], Genre) nombres = match_enum_or_all(row["nombre"], cls.Parsers.nombre, Nombre)
nombres = match_enum_or_all(row["nombre"], Nombre) for nombre in nombres:
for genre, nombre in itertools.product(genres, nombres): lemme.variantes[(nombre,)] = row["ortho"]
variant = Nom.Variant(genre=genre, nombre=nombre)
lemme._variants[variant] = row["ortho"]
elif lemme.cat_gram == CatGram.VERBE: elif lemme.cat_gram == CatGram.VERBE:
infover = row["infover"].split(";") infover = row["infover"].split(";")
@ -210,20 +204,23 @@ class Lexique:
temps = None temps = None
personne = None personne = None
if ver[0] == "inf":
temps = Temps(ver[0])
elif ver[0] == "ind":
temps_select = ":".join(ver[0:2]) temps_select = ":".join(ver[0:2])
if temps_select not in Temps: if temps_select not in Temps:
continue continue
temps = Temps(temps_select) temps = Temps(temps_select)
personne = Personne(ver[2]) personne = cls.Parsers.verbe_personne.get(ver[2], None)
else: if personne is None:
continue continue # we're not interested in all conj. persons
variant = Verbe.Variant(temps=temps, personne=personne) lemme.variantes[(temps, personne)] = row["ortho"]
lemme._variants[variant] = row["ortho"]
elif lemme.cat_gram == CatGram.ADJECTIF:
genres = match_enum_or_all(row["genre"], cls.Parsers.genre, Genre)
nombres = match_enum_or_all(row["nombre"], cls.Parsers.nombre, Nombre)
for genre, nombre in itertools.product(genres, nombres):
lemme.variantes[(genre, nombre)] = row["ortho"]
# No need to match adverbs (invariant)
return cls(out) return cls(out)
def most_common( def most_common(
@ -240,3 +237,52 @@ class Lexique:
out = list(filter(lambda word: word.cat_gram == cat_gram, self.dataset)) out = list(filter(lambda word: word.cat_gram == cat_gram, self.dataset))
out.sort(key=lambda word: word.freq, reverse=True) out.sort(key=lambda word: word.freq, reverse=True)
return out[:threshold] return out[:threshold]
def word_db(self, thresholds: t.Optional[dict[CatGram, int]] = None) -> WordDb:
    """Build a WordDb from the most common words of each category.

    ``thresholds`` optionally maps a CatGram to the value forwarded as second
    argument to ``most_common`` for that category; categories that are absent
    are forwarded ``None``.

    NOTE(review): every required form is read from ``variantes`` with a plain
    dict lookup, so a word lacking one of them raises ``KeyError``.
    """
    limits = thresholds or {}

    def top(cat: CatGram):
        # Most common words of one category, honouring its optional threshold.
        return self.most_common(cat, limits.get(cat))

    sing, plur = Nombre.SING, Nombre.PLUR

    noms = [
        Nom(
            genre=t.cast(Genre, entry.genre),  # expected to be set for nouns
            sing=entry.variantes[(sing,)],
            plur=entry.variantes[(plur,)],
        )
        for entry in top(CatGram.NOM)
    ]

    adjectifs = [
        Adjectif(
            masc_sing=entry.variantes[(Genre.MASC, sing)],
            masc_plur=entry.variantes[(Genre.MASC, plur)],
            fem_sing=entry.variantes[(Genre.FEM, sing)],
            fem_plur=entry.variantes[(Genre.FEM, plur)],
        )
        for entry in top(CatGram.ADJECTIF)
    ]

    verbes = [
        Verbe(
            present_sing=entry.variantes[(Temps.PRESENT, sing)],
            present_plur=entry.variantes[(Temps.PRESENT, plur)],
            futur_sing=entry.variantes[(Temps.FUTUR, sing)],
            futur_plur=entry.variantes[(Temps.FUTUR, plur)],
            imparfait_sing=entry.variantes[(Temps.IMPARFAIT, sing)],
            imparfait_plur=entry.variantes[(Temps.IMPARFAIT, plur)],
        )
        for entry in top(CatGram.VERBE)
    ]

    # Adverbs are invariant: the canonical form is the only one.
    adverbes = [Adverbe(adv=entry.mot) for entry in top(CatGram.ADVERBE)]

    return WordDb(
        noms=noms, adjectifs=adjectifs, verbes=verbes, adverbes=adverbes
    )

132
pwgen_fr/word_db.py Normal file
View file

@ -0,0 +1,132 @@
""" A pre-processed database of words, independant of their source """
import typing as t
from enum import Enum
import json
class Genre(Enum):
    """Grammatical gender of a word."""

    # The lower-cased member names are used by Adjectif.accord to build
    # the name of the field to read.
    MASC = "masculin"
    FEM = "féminin"
class Nombre(Enum):
    """Grammatical number of a word."""

    # The lower-cased member names are used by the accord() methods of
    # Nom, Adjectif and Verbe to build the name of the field to read.
    SING = "singulier"
    PLUR = "pluriel"
class Temps(Enum):
    """Verb tenses stored in the database."""

    # The lower-cased member names are used by Verbe.accord to build the
    # name of the field to read.
    PRESENT = "present"
    FUTUR = "futur"
    IMPARFAIT = "imparfait"
class Nom(t.NamedTuple):
    """Common noun, stored with its gender and both number forms."""

    genre: Genre
    sing: str  # singular form
    plur: str  # plural form

    def __str__(self) -> str:
        # A noun is displayed through its singular form.
        return str(self.sing)

    def accord(self, nombre: Nombre) -> str:
        """Return the form of the noun agreeing in number with ``nombre``."""
        # Field names mirror the lower-cased member names of Nombre.
        field_name = nombre.name.lower()
        return getattr(self, field_name)
class Adjectif(t.NamedTuple):
    """Adjective, stored in its four gender/number forms."""

    masc_sing: str
    masc_plur: str
    fem_sing: str
    fem_plur: str

    def __str__(self) -> str:
        # Displayed as "<masculine singular>/<feminine singular>".
        return "/".join((self.masc_sing, self.fem_sing))

    def accord(self, genre: Genre, nombre: Nombre) -> str:
        """Return the form agreeing in gender and number."""
        # Fields are named "<genre>_<nombre>" after the lower-cased enum names.
        field_name = "_".join((genre.name.lower(), nombre.name.lower()))
        return getattr(self, field_name)
class Verbe(t.NamedTuple):
    """Verb, stored as singular and plural forms of three tenses."""

    present_sing: str
    present_plur: str
    futur_sing: str
    futur_plur: str
    imparfait_sing: str
    imparfait_plur: str

    def __str__(self) -> str:
        # A verb is displayed through its present singular form.
        return str(self.present_sing)

    def accord(self, temps: Temps, nombre: Nombre) -> str:
        """Return the form agreeing in tense and number.

        Only the third person is used.
        """
        # Fields are named "<temps>_<nombre>" after the lower-cased enum names.
        field_name = "_".join((temps.name.lower(), nombre.name.lower()))
        return getattr(self, field_name)
class Adverbe(t.NamedTuple):
    """Adverb, wrapped in a named tuple for consistency with the other words."""

    adv: str

    def __str__(self) -> str:
        return self.accord()

    def accord(self) -> str:
        """Return the adverb itself; provided for consistency with other words."""
        return self.adv
class WordDb:
    """Serializable database of words, one list per grammatical category.

    The serialized representation is a dict with the keys of
    ``_serialize_data``; each value is a list of plain dicts (one per word) in
    which enum members are replaced by their ``value``.
    """

    # Serialized attribute name -> NamedTuple class of the items it holds.
    _serialize_data: dict[str, t.Type[t.NamedTuple]] = {
        "noms": Nom,
        "adjectifs": Adjectif,
        "verbes": Verbe,
        "adverbes": Adverbe,
    }

    noms: list[Nom]
    adjectifs: list[Adjectif]
    verbes: list[Verbe]
    adverbes: list[Adverbe]

    def __init__(
        self,
        noms: t.Optional[list[Nom]] = None,
        adjectifs: t.Optional[list[Adjectif]] = None,
        verbes: t.Optional[list[Verbe]] = None,
        adverbes: t.Optional[list[Adverbe]] = None,
    ):
        """Store the given word lists; an omitted or empty list becomes ``[]``."""
        self.noms = noms or []
        self.adjectifs = adjectifs or []
        self.verbes = verbes or []
        self.adverbes = adverbes or []

    @staticmethod
    def _plain(value):
        """Return the ``value`` of an Enum member; anything else unchanged."""
        return value.value if isinstance(value, Enum) else value

    def serialize(self) -> dict:
        """Serialize to a plain dictionary (no classes).

        Enum fields (e.g. ``Nom.genre``) are emitted as their value: the json
        module cannot encode Enum members, so leaving them in would make
        ``save`` fail.
        """
        return {
            attr: [
                {key: self._plain(val) for key, val in item._asdict().items()}
                for item in getattr(self, attr)
            ]
            for attr in self._serialize_data
        }

    def save(self, fd):
        """Serialize to this stream, as JSON."""
        json.dump(self.serialize(), fd)

    @classmethod
    @t.no_type_check  # serialization is messy
    def unserialize(cls, data: dict) -> "WordDb":
        """Rebuild a WordDb from the output of ``serialize``.

        Raises:
            KeyError: a category of ``_serialize_data`` is missing from ``data``.
            TypeError: an item's keys do not match the fields of its word class.
            ValueError: an enum field holds a value that matches no member.
        """
        parsed = {}
        for attr, attr_cls in cls._serialize_data.items():
            # Fields annotated with an Enum class must be converted back from
            # their serialized value.
            enum_fields = {
                name: hint
                for name, hint in t.get_type_hints(attr_cls).items()
                if isinstance(hint, type) and issubclass(hint, Enum)
            }
            parsed[attr] = [
                # Items are dicts: expand them as keyword arguments rather than
                # passing the dict as a single positional argument.
                attr_cls(
                    **{
                        name: enum_fields[name](value)
                        if name in enum_fields
                        else value
                        for name, value in item.items()
                    }
                )
                for item in data[attr]
            ]
        return cls(**parsed)

    @classmethod
    def load(cls, fd) -> "WordDb":
        """Unserialize from this stream (JSON, as written by ``save``)."""
        return cls.unserialize(json.load(fd))