Morphalou: frequency filter based on Lexique
This commit is contained in:
parent
a872ecb0f9
commit
4f152d45f2
3 changed files with 90 additions and 4 deletions
|
@ -90,9 +90,11 @@ class Lexique:
|
|||
}
|
||||
|
||||
dataset: list[Mot]
|
||||
lemfreq: dict[str, float]
|
||||
|
||||
def __init__(self, dataset):
|
||||
def __init__(self, dataset, lemfreq):
|
||||
self.dataset = dataset
|
||||
self.lemfreq = lemfreq
|
||||
|
||||
@classmethod
|
||||
def _ensure_uncompressed(cls):
|
||||
|
@ -155,6 +157,8 @@ class Lexique:
|
|||
def parse(cls) -> "Lexique":
|
||||
out = []
|
||||
rows = []
|
||||
lemfreq: dict[str, float] = {}
|
||||
|
||||
with cls.LEXIQUE_PATH.open("r") as h:
|
||||
reader = csv.DictReader(h, dialect="excel-tab")
|
||||
for row in reader:
|
||||
|
@ -188,6 +192,14 @@ class Lexique:
|
|||
|
||||
# Second pass: populate variants
|
||||
for row in rows:
|
||||
# Populate lemfreq
|
||||
old_freq = lemfreq.get(row["ortho"], 0.0)
|
||||
lemfreq[row["ortho"]] = max(
|
||||
old_freq,
|
||||
float(row["freqlemlivres"]),
|
||||
float(row["freqlemfilms2"]),
|
||||
)
|
||||
|
||||
lemme = cls._find_word(out, row)
|
||||
if lemme is None:
|
||||
continue
|
||||
|
@ -221,7 +233,7 @@ class Lexique:
|
|||
lemme.variantes[(genre, nombre)] = row["ortho"]
|
||||
|
||||
# No need to match adverbs (invariant)
|
||||
return cls(out)
|
||||
return cls(out, lemfreq)
|
||||
|
||||
def most_common(
|
||||
self, cat_gram: CatGram, threshold: t.Optional[int] = None
|
||||
|
|
74
pwgen_fr/morphalou_frequency.py
Normal file
74
pwgen_fr/morphalou_frequency.py
Normal file
|
@ -0,0 +1,74 @@
|
|||
""" Generates a worddb based on Morphalou, but limits to frequent words based on
|
||||
external sources (e.g. Lexique) """
|
||||
|
||||
import logging
|
||||
import typing as t
|
||||
|
||||
from . import lexique, morphalou
|
||||
from .word_db import Adjectif, Adverbe, Genre, Nom, Nombre, Temps, Verbe, WordDb
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MorphalouFreqSet:
    """Morphalou word database restricted to words that Lexique knows about.

    The full Morphalou database is kept in ``morphalou_db``; ``filtered_db``
    holds only the entries for which at least one inspected form has a
    strictly positive frequency in ``lexique.lemfreq``.
    """

    morphalou_db: WordDb
    lexique: lexique.Lexique
    filtered_db: WordDb

    def __init__(
        self,
        morphalou_db: t.Optional[WordDb] = None,
        lexique: t.Optional[lexique.Lexique] = None,
    ):
        """Build the filtered database.

        Args:
            morphalou_db: an already-parsed Morphalou word database. When
                omitted, Morphalou is parsed from scratch.
            lexique: an already-parsed Lexique. When omitted, Lexique is
                parsed from scratch.
        """
        # `is None` rather than truthiness: an explicitly supplied object must
        # be used as-is even if it happens to evaluate as falsy.
        if morphalou_db is None:
            morphalou_set = morphalou.MorphalouSet()
            morphalou_set.parse()
            self.morphalou_db = morphalou_set.word_db
        else:
            self.morphalou_db = morphalou_db

        if lexique is None:
            # The `lexique` parameter shadows the module of the same name, so
            # `lexique.Lexique` would be an attribute lookup on None here.
            # Import the class under a name that cannot be shadowed.
            from .lexique import Lexique

            self.lexique = Lexique.parse()
        else:
            self.lexique = lexique

        self.filtered_db = self._filter_lexique()

    def _has_frequency(self, *formes: str) -> bool:
        """Tell whether any of `formes` has a strictly positive frequency.

        Forms missing from ``lexique.lemfreq`` count as frequency 0.
        """
        lemfreq = self.lexique.lemfreq
        return any(lemfreq.get(forme, 0.0) > 0 for forme in formes)

    def _filter_nom(self, nom: Nom) -> bool:
        """Keep a noun if its singular or plural form has a frequency."""
        return self._has_frequency(nom.sing, nom.plur)

    def _filter_adjectif(self, adjectif: Adjectif) -> bool:
        """Keep an adjective if its masculine or feminine singular form has a
        frequency."""
        return self._has_frequency(adjectif.masc_sing, adjectif.fem_sing)

    def _filter_verbe(self, verbe: Verbe) -> bool:
        """Keep a verb if any of its six inspected conjugated forms has a
        frequency."""
        return self._has_frequency(
            verbe.present_sing,
            verbe.futur_sing,
            verbe.imparfait_sing,
            verbe.present_plur,
            verbe.futur_plur,
            verbe.imparfait_plur,
        )

    def _filter_adverbe(self, adverbe: Adverbe) -> bool:
        """Keep a single-word adverb that has a frequency.

        Adverbs containing a space are always rejected, whatever their
        frequency.
        """
        if " " in adverbe.adv:
            return False
        return self._has_frequency(adverbe.adv)

    def _filter_lexique(self) -> WordDb:
        """Return a new WordDb holding only the entries of ``morphalou_db``
        accepted by the per-category filters."""
        source = self.morphalou_db
        out = WordDb()
        out.noms = [nom for nom in source.noms if self._filter_nom(nom)]
        out.adjectifs = [adj for adj in source.adjectifs if self._filter_adjectif(adj)]
        out.verbes = [verbe for verbe in source.verbes if self._filter_verbe(verbe)]
        out.adverbes = [adv for adv in source.adverbes if self._filter_adverbe(adv)]
        return out
|
|
@ -16,7 +16,7 @@ class Genre(Enum):
|
|||
@classmethod
|
||||
def pick(cls) -> "Genre":
|
||||
"""random-pick (avoids inv)"""
|
||||
return secrets.choice([cls.masc, cls.fem])
|
||||
return secrets.choice([cls.MASC, cls.FEM])
|
||||
|
||||
|
||||
class Nombre(Enum):
|
||||
|
@ -141,7 +141,7 @@ class Adverbe(t.NamedTuple):
|
|||
class WordDb:
|
||||
"""Base de donnée de mots, sérialisable"""
|
||||
|
||||
SERIALIZED_GZ_LOCATION = Path(__file__).parent.parent / "morphalou_full.json.gz"
|
||||
SERIALIZED_GZ_LOCATION = Path(__file__).parent.parent / "morphalou.json.gz"
|
||||
|
||||
_serialize_data: dict[str, t.Type[t.NamedTuple]] = {
|
||||
"noms": Nom,
|
||||
|
|
Loading…
Reference in a new issue