Morphalou: frequency filter based on Lexique

This commit is contained in:
Théophile Bastian 2024-09-23 19:30:40 +02:00
parent a872ecb0f9
commit 4f152d45f2
3 changed files with 90 additions and 4 deletions

View file

@ -90,9 +90,11 @@ class Lexique:
}
dataset: list[Mot]
lemfreq: dict[str, float]
def __init__(self, dataset):
def __init__(self, dataset, lemfreq):
self.dataset = dataset
self.lemfreq = lemfreq
@classmethod
def _ensure_uncompressed(cls):
@ -155,6 +157,8 @@ class Lexique:
def parse(cls) -> "Lexique":
out = []
rows = []
lemfreq: dict[str, float] = {}
with cls.LEXIQUE_PATH.open("r") as h:
reader = csv.DictReader(h, dialect="excel-tab")
for row in reader:
@ -188,6 +192,14 @@ class Lexique:
# Second pass: populate variants
for row in rows:
# Populate lemfreq
old_freq = lemfreq.get(row["ortho"], 0.0)
lemfreq[row["ortho"]] = max(
old_freq,
float(row["freqlemlivres"]),
float(row["freqlemfilms2"]),
)
lemme = cls._find_word(out, row)
if lemme is None:
continue
@ -221,7 +233,7 @@ class Lexique:
lemme.variantes[(genre, nombre)] = row["ortho"]
# No need to match adverbs (invariant)
return cls(out)
return cls(out, lemfreq)
def most_common(
self, cat_gram: CatGram, threshold: t.Optional[int] = None

View file

@ -0,0 +1,74 @@
""" Generates a worddb based on Morphalou, but limits to frequent words based on
external sources (eg Lexique) """
import logging
import typing as t
from . import lexique, morphalou
from .word_db import Adjectif, Adverbe, Genre, Nom, Nombre, Temps, Verbe, WordDb
logger = logging.getLogger(__name__)
class MorphalouFreqSet:
morphalou_db: WordDb
lexique: lexique.Lexique
filtered_db: WordDb
def __init__(
self,
morphalou_db: t.Optional[WordDb] = None,
lexique: t.Optional[lexique.Lexique] = None,
):
if not morphalou_db:
morphalou_set = morphalou.MorphalouSet()
morphalou_set.parse()
self.morphalou_db = morphalou_set.word_db
else:
self.morphalou_db = morphalou_db
if not lexique:
self.lexique = lexique.Lexique.parse()
else:
self.lexique = lexique
self.filtered_db = self._filter_lexique()
def _filter_nom(self, nom: Nom) -> bool:
freq = max(
self.lexique.lemfreq.get(nom.sing, 0.0),
self.lexique.lemfreq.get(nom.plur, 0.0),
)
return freq > 0
def _filter_adjectif(self, adjectif: Adjectif) -> bool:
freq = max(
self.lexique.lemfreq.get(adjectif.masc_sing, 0.0),
self.lexique.lemfreq.get(adjectif.fem_sing, 0.0),
)
return freq > 0
def _filter_verbe(self, verbe: Verbe) -> bool:
freq = max(
self.lexique.lemfreq.get(verbe.present_sing, 0.0),
self.lexique.lemfreq.get(verbe.futur_sing, 0.0),
self.lexique.lemfreq.get(verbe.imparfait_sing, 0.0),
self.lexique.lemfreq.get(verbe.present_plur, 0.0),
self.lexique.lemfreq.get(verbe.futur_plur, 0.0),
self.lexique.lemfreq.get(verbe.imparfait_plur, 0.0),
)
return freq > 0
def _filter_adverbe(self, adverbe: Adverbe) -> bool:
if " " in adverbe.adv:
return False
freq = self.lexique.lemfreq.get(adverbe.adv, 0.0)
return freq > 0
def _filter_lexique(self) -> WordDb:
out = WordDb()
out.noms = list(filter(self._filter_nom, self.morphalou_db.noms))
out.adjectifs = list(filter(self._filter_adjectif, self.morphalou_db.adjectifs))
out.verbes = list(filter(self._filter_verbe, self.morphalou_db.verbes))
out.adverbes = list(filter(self._filter_adverbe, self.morphalou_db.adverbes))
return out

View file

@ -16,7 +16,7 @@ class Genre(Enum):
@classmethod
def pick(cls) -> "Genre":
"""random-pick (avoids inv)"""
return secrets.choice([cls.masc, cls.fem])
return secrets.choice([cls.MASC, cls.FEM])
class Nombre(Enum):
@ -141,7 +141,7 @@ class Adverbe(t.NamedTuple):
class WordDb:
"""Base de donnée de mots, sérialisable"""
SERIALIZED_GZ_LOCATION = Path(__file__).parent.parent / "morphalou_full.json.gz"
SERIALIZED_GZ_LOCATION = Path(__file__).parent.parent / "morphalou.json.gz"
_serialize_data: dict[str, t.Type[t.NamedTuple]] = {
"noms": Nom,