diff --git a/pwgen_fr/lexique.py b/pwgen_fr/lexique.py
index 9e04dbb..c25277a 100644
--- a/pwgen_fr/lexique.py
+++ b/pwgen_fr/lexique.py
@@ -90,9 +90,11 @@ class Lexique:
     }
 
     dataset: list[Mot]
+    lemfreq: dict[str, float]
 
-    def __init__(self, dataset):
+    def __init__(self, dataset, lemfreq):
         self.dataset = dataset
+        self.lemfreq = lemfreq
 
     @classmethod
     def _ensure_uncompressed(cls):
@@ -155,6 +157,8 @@ class Lexique:
     def parse(cls) -> "Lexique":
         out = []
         rows = []
+        lemfreq: dict[str, float] = {}
+
         with cls.LEXIQUE_PATH.open("r") as h:
             reader = csv.DictReader(h, dialect="excel-tab")
             for row in reader:
@@ -188,6 +192,14 @@ class Lexique:
 
         # Second pass: populate variants
         for row in rows:
+            # Populate lemfreq
+            old_freq = lemfreq.get(row["ortho"], 0.0)
+            lemfreq[row["ortho"]] = max(
+                old_freq,
+                float(row["freqlemlivres"]),
+                float(row["freqlemfilms2"]),
+            )
+
             lemme = cls._find_word(out, row)
             if lemme is None:
                 continue
@@ -221,7 +233,7 @@ class Lexique:
                 lemme.variantes[(genre, nombre)] = row["ortho"]
             # No need to match adverbs (invariant)
 
-        return cls(out)
+        return cls(out, lemfreq)
 
     def most_common(
         self, cat_gram: CatGram, threshold: t.Optional[int] = None
diff --git a/pwgen_fr/morphalou_frequency.py b/pwgen_fr/morphalou_frequency.py
new file mode 100644
index 0000000..9633f0f
--- /dev/null
+++ b/pwgen_fr/morphalou_frequency.py
@@ -0,0 +1,89 @@
+""" Generates a worddb based on Morphalou, but limits to frequent words based on
+external sources (eg Lexique) """
+
+import logging
+import typing as t
+
+from . import lexique, morphalou
+from .lexique import Lexique
+from .word_db import Adjectif, Adverbe, Genre, Nom, Nombre, Temps, Verbe, WordDb
+
+logger = logging.getLogger(__name__)
+
+
+class MorphalouFreqSet:
+    """Morphalou word database restricted to words that Lexique knows about.
+
+    A word is kept when at least one of its checked forms has a strictly
+    positive lemma frequency in ``Lexique.lemfreq``.
+    """
+
+    morphalou_db: WordDb
+    lexique: lexique.Lexique
+    filtered_db: WordDb
+
+    def __init__(
+        self,
+        morphalou_db: t.Optional[WordDb] = None,
+        lexique: t.Optional[lexique.Lexique] = None,
+    ):
+        """Build the filtered database, parsing any source that is not given."""
+        if not morphalou_db:
+            morphalou_set = morphalou.MorphalouSet()
+            morphalou_set.parse()
+            self.morphalou_db = morphalou_set.word_db
+        else:
+            self.morphalou_db = morphalou_db
+
+        if not lexique:
+            # The `lexique` parameter shadows the `lexique` module here, so the
+            # class must be reached through its direct import.
+            self.lexique = Lexique.parse()
+        else:
+            self.lexique = lexique
+
+        self.filtered_db = self._filter_lexique()
+
+    def _filter_nom(self, nom: Nom) -> bool:
+        """Keep a noun when its singular or plural form has a positive frequency."""
+        freq = max(
+            self.lexique.lemfreq.get(nom.sing, 0.0),
+            self.lexique.lemfreq.get(nom.plur, 0.0),
+        )
+        return freq > 0
+
+    def _filter_adjectif(self, adjectif: Adjectif) -> bool:
+        """Keep an adjective when a singular form has a positive frequency."""
+        freq = max(
+            self.lexique.lemfreq.get(adjectif.masc_sing, 0.0),
+            self.lexique.lemfreq.get(adjectif.fem_sing, 0.0),
+        )
+        return freq > 0
+
+    def _filter_verbe(self, verbe: Verbe) -> bool:
+        """Keep a verb when any of its six checked forms has a positive frequency."""
+        freq = max(
+            self.lexique.lemfreq.get(verbe.present_sing, 0.0),
+            self.lexique.lemfreq.get(verbe.futur_sing, 0.0),
+            self.lexique.lemfreq.get(verbe.imparfait_sing, 0.0),
+            self.lexique.lemfreq.get(verbe.present_plur, 0.0),
+            self.lexique.lemfreq.get(verbe.futur_plur, 0.0),
+            self.lexique.lemfreq.get(verbe.imparfait_plur, 0.0),
+        )
+        return freq > 0
+
+    def _filter_adverbe(self, adverbe: Adverbe) -> bool:
+        """Keep a single-word adverb that has a positive frequency."""
+        if " " in adverbe.adv:
+            return False
+        freq = self.lexique.lemfreq.get(adverbe.adv, 0.0)
+        return freq > 0
+
+    def _filter_lexique(self) -> WordDb:
+        """Return a new WordDb holding only the Morphalou words that pass the filters."""
+        out = WordDb()
+        out.noms = list(filter(self._filter_nom, self.morphalou_db.noms))
+        out.adjectifs = list(filter(self._filter_adjectif, self.morphalou_db.adjectifs))
+        out.verbes = list(filter(self._filter_verbe, self.morphalou_db.verbes))
+        out.adverbes = list(filter(self._filter_adverbe, self.morphalou_db.adverbes))
+        return out
diff --git a/pwgen_fr/word_db.py b/pwgen_fr/word_db.py
index fd4ecac..288ef5b 100644
--- a/pwgen_fr/word_db.py
+++ b/pwgen_fr/word_db.py
@@ -16,7 +16,7 @@ class Genre(Enum):
     @classmethod
     def pick(cls) -> "Genre":
         """random-pick (avoids inv)"""
-        return secrets.choice([cls.masc, cls.fem])
+        return secrets.choice([cls.MASC, cls.FEM])
 
 
 class Nombre(Enum):
@@ -141,7 +141,7 @@ class Adverbe(t.NamedTuple):
 class WordDb:
     """Base de donnée de mots, sérialisable"""
 
-    SERIALIZED_GZ_LOCATION = Path(__file__).parent.parent / "morphalou_full.json.gz"
+    SERIALIZED_GZ_LOCATION = Path(__file__).parent.parent / "morphalou.json.gz"
 
     _serialize_data: dict[str, t.Type[t.NamedTuple]] = {
         "noms": Nom,