75 lines
2.4 KiB
Python
75 lines
2.4 KiB
Python
|
"""Generate a worddb based on Morphalou, keeping only the words that are frequent
according to external sources (e.g. Lexique)."""
|
||
|
|
||
|
import logging
|
||
|
import typing as t
|
||
|
|
||
|
from . import lexique, morphalou
|
||
|
from .word_db import Adjectif, Adverbe, Genre, Nom, Nombre, Temps, Verbe, WordDb
|
||
|
|
||
|
logger = logging.getLogger(__name__)
|
||
|
|
||
|
|
||
|
class MorphalouFreqSet:
    """Morphalou word database restricted to entries that Lexique knows about.

    On construction, every noun, adjective, verb and adverb of the Morphalou
    database is checked against ``lexique.lemfreq``; an entry is kept only when
    at least one of its inspected forms has a strictly positive value there.
    The surviving entries are stored in ``filtered_db``.
    """

    # Unfiltered source database.
    morphalou_db: WordDb
    # Frequency source; only its ``lemfreq`` mapping is read by this class.
    lexique: lexique.Lexique
    # Entries of ``morphalou_db`` that passed the frequency filters.
    filtered_db: WordDb

    def __init__(
        self,
        morphalou_db: t.Optional[WordDb] = None,
        lexique: t.Optional[lexique.Lexique] = None,
    ):
        """Load (or accept) both data sources, then build ``filtered_db``.

        Args:
            morphalou_db: Word database to filter. When ``None``, a
                ``morphalou.MorphalouSet`` is created and parsed, and its
                ``word_db`` is used.
            lexique: Frequency data. When ``None``, ``Lexique.parse()`` from
                the ``lexique`` module is used.
        """
        # ``is None`` rather than truthiness: an explicitly supplied (even
        # empty/falsy) object must not be silently replaced by the default.
        if morphalou_db is None:
            morphalou_set = morphalou.MorphalouSet()
            morphalou_set.parse()
            morphalou_db = morphalou_set.word_db
        self.morphalou_db = morphalou_db

        if lexique is None:
            # The ``lexique`` parameter shadows the ``lexique`` module inside
            # this function, so ``lexique.Lexique`` would be looked up on
            # ``None``. Re-import the module under a non-clashing name.
            from . import lexique as lexique_module

            lexique = lexique_module.Lexique.parse()
        self.lexique = lexique

        self.filtered_db = self._filter_lexique()

    def _max_lemfreq(self, *forms):
        """Return the largest ``lexique.lemfreq`` value among ``forms``.

        Forms missing from ``lemfreq`` count as ``0.0``.
        """
        lemfreq = self.lexique.lemfreq
        return max(lemfreq.get(form, 0.0) for form in forms)

    def _filter_nom(self, nom: Nom) -> bool:
        """Return True if the singular or plural form of ``nom`` has a positive frequency."""
        return self._max_lemfreq(nom.sing, nom.plur) > 0

    def _filter_adjectif(self, adjectif: Adjectif) -> bool:
        """Return True if the masculine or feminine singular form has a positive frequency."""
        return self._max_lemfreq(adjectif.masc_sing, adjectif.fem_sing) > 0

    def _filter_verbe(self, verbe: Verbe) -> bool:
        """Return True if any inspected form of ``verbe`` has a positive frequency.

        The inspected forms are the present, futur and imparfait attributes,
        each in its ``_sing`` and ``_plur`` variant.
        """
        return (
            self._max_lemfreq(
                verbe.present_sing,
                verbe.futur_sing,
                verbe.imparfait_sing,
                verbe.present_plur,
                verbe.futur_plur,
                verbe.imparfait_plur,
            )
            > 0
        )

    def _filter_adverbe(self, adverbe: Adverbe) -> bool:
        """Return True if ``adverbe`` contains no space and has a positive frequency."""
        # Anything containing a space is rejected before the frequency lookup.
        if " " in adverbe.adv:
            return False
        return self._max_lemfreq(adverbe.adv) > 0

    def _filter_lexique(self) -> WordDb:
        """Build a new ``WordDb`` holding only the entries accepted by the filters."""
        source = self.morphalou_db
        out = WordDb()
        out.noms = [nom for nom in source.noms if self._filter_nom(nom)]
        out.adjectifs = [adj for adj in source.adjectifs if self._filter_adjectif(adj)]
        out.verbes = [verbe for verbe in source.verbes if self._filter_verbe(verbe)]
        out.adverbes = [adv for adv in source.adverbes if self._filter_adverbe(adv)]
        return out
|