Compare commits
No commits in common. "4f152d45f232a2d3b679e43143210f738a5ec2bc" and "3c10d987e66bf56d1ca4df75135ac6185e9febc4" have entirely different histories.
4f152d45f2
...
3c10d987e6
7 changed files with 8 additions and 162 deletions
1
data/raw/.gitignore
vendored
1
data/raw/.gitignore
vendored
|
@ -1,2 +1 @@
|
|||
Lexique383
|
||||
Morphalou3.1_formatTEI
|
||||
|
|
Binary file not shown.
|
@ -1,29 +0,0 @@
|
|||
# Versions réduites de jeux de données
|
||||
|
||||
Les fichiers dans ce dossier sont des versions réduites de jeux de données
|
||||
tiers.
|
||||
|
||||
Veillez à respecter les licences respectives de ces ressources dans les usages
|
||||
que vous en faites.
|
||||
|
||||
## Lexique
|
||||
|
||||
La base de données Lexique (http://www.lexique.org/) est le travail, entre
|
||||
autres contributeurs et contributrices, de Boris New et Christophe Pallier,
|
||||
sous licence CC BY-NC
|
||||
|
||||
Le fichier présent ici est une version tronquée de la v3.83. Il ne conserve que
|
||||
la partie utile au présent logiciel. Le jeu de données entier est disponible
|
||||
sur leur site.
|
||||
|
||||
## Morphalou
|
||||
|
||||
La base de données Morphalou
|
||||
(https://www.ortolang.fr/market/lexicons/morphalou/v3.1) est le travail, entre
|
||||
autres contributeurs et contributrices, de Sandrine Ollinger, Christophe
|
||||
Benzitoun, Evelyne Jacquey, Ulrike Fleury, Etienne Petitjean et Marie
|
||||
Tonnelier. Sa version 3.1 est distribuée sous licence LGPL-LR.
|
||||
|
||||
Le fichier présent ici est une version tronquée de la v3.1. Il ne conserve que
|
||||
la partie utile au présent logiciel. Le jeu de données entier est disponible
|
||||
sur leur site.
|
|
@ -90,11 +90,9 @@ class Lexique:
|
|||
}
|
||||
|
||||
dataset: list[Mot]
|
||||
lemfreq: dict[str, float]
|
||||
|
||||
def __init__(self, dataset, lemfreq):
|
||||
def __init__(self, dataset):
|
||||
self.dataset = dataset
|
||||
self.lemfreq = lemfreq
|
||||
|
||||
@classmethod
|
||||
def _ensure_uncompressed(cls):
|
||||
|
@ -157,8 +155,6 @@ class Lexique:
|
|||
def parse(cls) -> "Lexique":
|
||||
out = []
|
||||
rows = []
|
||||
lemfreq: dict[str, float] = {}
|
||||
|
||||
with cls.LEXIQUE_PATH.open("r") as h:
|
||||
reader = csv.DictReader(h, dialect="excel-tab")
|
||||
for row in reader:
|
||||
|
@ -192,14 +188,6 @@ class Lexique:
|
|||
|
||||
# Second pass: populate variants
|
||||
for row in rows:
|
||||
# Populate lemfreq
|
||||
old_freq = lemfreq.get(row["ortho"], 0.0)
|
||||
lemfreq[row["ortho"]] = max(
|
||||
old_freq,
|
||||
float(row["freqlemlivres"]),
|
||||
float(row["freqlemfilms2"]),
|
||||
)
|
||||
|
||||
lemme = cls._find_word(out, row)
|
||||
if lemme is None:
|
||||
continue
|
||||
|
@ -233,7 +221,7 @@ class Lexique:
|
|||
lemme.variantes[(genre, nombre)] = row["ortho"]
|
||||
|
||||
# No need to match adverbs (invariant)
|
||||
return cls(out, lemfreq)
|
||||
return cls(out)
|
||||
|
||||
def most_common(
|
||||
self, cat_gram: CatGram, threshold: t.Optional[int] = None
|
||||
|
|
|
@ -1,12 +1,9 @@
|
|||
""" Reads the Morphalou dataset, in its TSV form """
|
||||
|
||||
import itertools
|
||||
import logging
|
||||
import subprocess
|
||||
import typing as t
|
||||
from pathlib import Path
|
||||
|
||||
from lxml import etree
|
||||
from pathlib import Path
|
||||
import itertools
|
||||
|
||||
from .word_db import Adjectif, Adverbe, Genre, Nom, Nombre, Temps, Verbe, WordDb
|
||||
|
||||
|
@ -15,12 +12,11 @@ TSV_NS = {
|
|||
"xml": "http://www.w3.org/XML/1998/namespace",
|
||||
}
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MorphalouSet:
|
||||
MORPHALOU_DIR_PATH = (
|
||||
Path(__file__).parent.parent / "data/raw/Morphalou3.1_formatTEI"
|
||||
Path(__file__).parent.parent
|
||||
/ "data/raw/morphalou/morphalou/5/Morphalou3.1_formatTEI"
|
||||
)
|
||||
MORPHALOU_FILENAME_TEMPLATE = "{cat_name}_Morphalou3.1_TEI.xml"
|
||||
|
||||
|
@ -36,44 +32,10 @@ class MorphalouSet:
|
|||
def __init__(self):
|
||||
self.word_db = WordDb()
|
||||
|
||||
@classmethod
|
||||
def _ensure_uncompressed(cls):
|
||||
"""Ensures the dataset is uncompressed"""
|
||||
if cls.MORPHALOU_DIR_PATH.exists():
|
||||
return
|
||||
|
||||
lexique_archive = cls.MORPHALOU_DIR_PATH.with_suffix(".tar.xz")
|
||||
if not lexique_archive.exists():
|
||||
logger.error("Missing compressed dataset at %s", lexique_archive)
|
||||
raise Exception(f"Missing compressed dataset at {lexique_archive}")
|
||||
|
||||
logger.info("Uncompressing dataset")
|
||||
subprocess.check_call(
|
||||
[
|
||||
"tar",
|
||||
"-xJf",
|
||||
lexique_archive.as_posix(),
|
||||
"-C",
|
||||
lexique_archive.parent.as_posix(),
|
||||
]
|
||||
)
|
||||
|
||||
if not cls.MORPHALOU_DIR_PATH.exists():
|
||||
logger.error(
|
||||
"Uncompressed dataset still missing at %s after extraction",
|
||||
cls.MORPHALOU_DIR_PATH,
|
||||
)
|
||||
raise Exception(
|
||||
f"Uncompressed dataset still missing at {cls.MORPHALOU_DIR_PATH} after extraction"
|
||||
)
|
||||
|
||||
def parse(self):
|
||||
"""Parses the dataset"""
|
||||
self.__class__._ensure_uncompressed()
|
||||
|
||||
for cat, cat_file in self.__class__.CAT_MAPPING.items():
|
||||
word_db_elt = WordDb.CATEGORY_TO_ATTR[cat]
|
||||
logging.info("Parsing %s...", word_db_elt)
|
||||
setattr(
|
||||
self.word_db,
|
||||
word_db_elt,
|
||||
|
|
|
@ -1,74 +0,0 @@
|
|||
""" Generates a worddb based on Morphalou, but limits to frequent words based on
|
||||
external sources (eg Lexique) """
|
||||
|
||||
import logging
|
||||
import typing as t
|
||||
|
||||
from . import lexique, morphalou
|
||||
from .word_db import Adjectif, Adverbe, Genre, Nom, Nombre, Temps, Verbe, WordDb
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MorphalouFreqSet:
|
||||
morphalou_db: WordDb
|
||||
lexique: lexique.Lexique
|
||||
filtered_db: WordDb
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
morphalou_db: t.Optional[WordDb] = None,
|
||||
lexique: t.Optional[lexique.Lexique] = None,
|
||||
):
|
||||
if not morphalou_db:
|
||||
morphalou_set = morphalou.MorphalouSet()
|
||||
morphalou_set.parse()
|
||||
self.morphalou_db = morphalou_set.word_db
|
||||
else:
|
||||
self.morphalou_db = morphalou_db
|
||||
|
||||
if not lexique:
|
||||
self.lexique = lexique.Lexique.parse()
|
||||
else:
|
||||
self.lexique = lexique
|
||||
|
||||
self.filtered_db = self._filter_lexique()
|
||||
|
||||
def _filter_nom(self, nom: Nom) -> bool:
|
||||
freq = max(
|
||||
self.lexique.lemfreq.get(nom.sing, 0.0),
|
||||
self.lexique.lemfreq.get(nom.plur, 0.0),
|
||||
)
|
||||
return freq > 0
|
||||
|
||||
def _filter_adjectif(self, adjectif: Adjectif) -> bool:
|
||||
freq = max(
|
||||
self.lexique.lemfreq.get(adjectif.masc_sing, 0.0),
|
||||
self.lexique.lemfreq.get(adjectif.fem_sing, 0.0),
|
||||
)
|
||||
return freq > 0
|
||||
|
||||
def _filter_verbe(self, verbe: Verbe) -> bool:
|
||||
freq = max(
|
||||
self.lexique.lemfreq.get(verbe.present_sing, 0.0),
|
||||
self.lexique.lemfreq.get(verbe.futur_sing, 0.0),
|
||||
self.lexique.lemfreq.get(verbe.imparfait_sing, 0.0),
|
||||
self.lexique.lemfreq.get(verbe.present_plur, 0.0),
|
||||
self.lexique.lemfreq.get(verbe.futur_plur, 0.0),
|
||||
self.lexique.lemfreq.get(verbe.imparfait_plur, 0.0),
|
||||
)
|
||||
return freq > 0
|
||||
|
||||
def _filter_adverbe(self, adverbe: Adverbe) -> bool:
|
||||
if " " in adverbe.adv:
|
||||
return False
|
||||
freq = self.lexique.lemfreq.get(adverbe.adv, 0.0)
|
||||
return freq > 0
|
||||
|
||||
def _filter_lexique(self) -> WordDb:
|
||||
out = WordDb()
|
||||
out.noms = list(filter(self._filter_nom, self.morphalou_db.noms))
|
||||
out.adjectifs = list(filter(self._filter_adjectif, self.morphalou_db.adjectifs))
|
||||
out.verbes = list(filter(self._filter_verbe, self.morphalou_db.verbes))
|
||||
out.adverbes = list(filter(self._filter_adverbe, self.morphalou_db.adverbes))
|
||||
return out
|
|
@ -16,7 +16,7 @@ class Genre(Enum):
|
|||
@classmethod
|
||||
def pick(cls) -> "Genre":
|
||||
"""random-pick (avoids inv)"""
|
||||
return secrets.choice([cls.MASC, cls.FEM])
|
||||
return secrets.choice([cls.masc, cls.fem])
|
||||
|
||||
|
||||
class Nombre(Enum):
|
||||
|
@ -141,7 +141,7 @@ class Adverbe(t.NamedTuple):
|
|||
class WordDb:
|
||||
"""Base de donnée de mots, sérialisable"""
|
||||
|
||||
SERIALIZED_GZ_LOCATION = Path(__file__).parent.parent / "morphalou.json.gz"
|
||||
SERIALIZED_GZ_LOCATION = Path(__file__).parent.parent / "morphalou_full.json.gz"
|
||||
|
||||
_serialize_data: dict[str, t.Type[t.NamedTuple]] = {
|
||||
"noms": Nom,
|
||||
|
|
Loading…
Reference in a new issue