Compare commits
3 commits: 3c10d987e6...4f152d45f2

Author | SHA1 | Date
---|---|---
 | 4f152d45f2 |
 | a872ecb0f9 |
 | e8379656e1 |

7 changed files with 163 additions and 9 deletions

data/raw/.gitignore (vendored, 1 line changed)

@@ -1 +1,2 @@
 Lexique383
+Morphalou3.1_formatTEI

data/raw/Morphalou3.1_formatTEI.tar.xz (new binary file)
Binary file not shown.

data/raw/README.md (new file, 29 lines)

@@ -0,0 +1,29 @@
+# Reduced versions of datasets
+
+The files in this directory are reduced versions of third-party
+datasets.
+
+Please respect the respective licenses of these resources in any use
+you make of them.
+
+## Lexique
+
+The Lexique database (http://www.lexique.org/) is the work of, among
+other contributors, Boris New and Christophe Pallier,
+under the CC BY-NC license.
+
+The file here is a truncated version of v3.83. It keeps only the part
+useful to this software. The full dataset is available
+on their website.
+
+## Morphalou
+
+The Morphalou database
+(https://www.ortolang.fr/market/lexicons/morphalou/v3.1) is the work,
+among other contributors, of Sandrine Ollinger, Christophe
+Benzitoun, Evelyne Jacquey, Ulrike Fleury, Etienne Petitjean and Marie
+Tonnelier. Version 3.1 is distributed under the LGPL-LR license.
+
+The file here is a truncated version of v3.1. It keeps only the part
+useful to this software. The full dataset is available
+on their website.

@@ -90,9 +90,11 @@ class Lexique:
     }
 
     dataset: list[Mot]
+    lemfreq: dict[str, float]
 
-    def __init__(self, dataset):
+    def __init__(self, dataset, lemfreq):
         self.dataset = dataset
+        self.lemfreq = lemfreq
 
     @classmethod
     def _ensure_uncompressed(cls):

@@ -155,6 +157,8 @@ class Lexique:
     def parse(cls) -> "Lexique":
         out = []
         rows = []
+        lemfreq: dict[str, float] = {}
+
         with cls.LEXIQUE_PATH.open("r") as h:
             reader = csv.DictReader(h, dialect="excel-tab")
             for row in reader:

@@ -188,6 +192,14 @@ class Lexique:
 
         # Second pass: populate variants
         for row in rows:
+            # Populate lemfreq
+            old_freq = lemfreq.get(row["ortho"], 0.0)
+            lemfreq[row["ortho"]] = max(
+                old_freq,
+                float(row["freqlemlivres"]),
+                float(row["freqlemfilms2"]),
+            )
+
             lemme = cls._find_word(out, row)
             if lemme is None:
                 continue

@@ -221,7 +233,7 @@ class Lexique:
                 lemme.variantes[(genre, nombre)] = row["ortho"]
 
         # No need to match adverbs (invariant)
-        return cls(out)
+        return cls(out, lemfreq)
 
     def most_common(
         self, cat_gram: CatGram, threshold: t.Optional[int] = None
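
The lemfreq additions above collapse the Lexique rows into a single frequency per surface form, keeping the highest value seen across the book and film counts. A minimal sketch of that aggregation on hypothetical rows (the column values are invented; real rows come from the csv.DictReader pass shown above):

```python
rows = [
    {"ortho": "chat", "freqlemlivres": "31.2", "freqlemfilms2": "55.7"},
    {"ortho": "chat", "freqlemlivres": "12.0", "freqlemfilms2": "8.3"},
]

lemfreq: dict[str, float] = {}
for row in rows:
    old_freq = lemfreq.get(row["ortho"], 0.0)
    # Keep the highest frequency seen for this spelling, across books and films.
    lemfreq[row["ortho"]] = max(
        old_freq,
        float(row["freqlemlivres"]),
        float(row["freqlemfilms2"]),
    )

print(lemfreq)  # {'chat': 55.7}
```
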

@@ -1,9 +1,12 @@
 """ Reads the Morphalou dataset, in its TSV form """
 
-import typing as t
-from lxml import etree
-from pathlib import Path
 import itertools
+import logging
+import subprocess
+import typing as t
+from pathlib import Path
+
+from lxml import etree
 
 from .word_db import Adjectif, Adverbe, Genre, Nom, Nombre, Temps, Verbe, WordDb
 

@@ -12,11 +15,12 @@ TSV_NS = {
     "xml": "http://www.w3.org/XML/1998/namespace",
 }
 
+logger = logging.getLogger(__name__)
+
 
 class MorphalouSet:
     MORPHALOU_DIR_PATH = (
-        Path(__file__).parent.parent
-        / "data/raw/morphalou/morphalou/5/Morphalou3.1_formatTEI"
+        Path(__file__).parent.parent / "data/raw/Morphalou3.1_formatTEI"
     )
     MORPHALOU_FILENAME_TEMPLATE = "{cat_name}_Morphalou3.1_TEI.xml"
 
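
With the directory relocated to the vendored archive's extraction path, a per-category file resolves from MORPHALOU_DIR_PATH and MORPHALOU_FILENAME_TEMPLATE. An illustrative sketch (the "nom" category name is an assumption, since CAT_MAPPING is not part of this diff):

```python
from pathlib import Path

MORPHALOU_DIR_PATH = Path("data/raw/Morphalou3.1_formatTEI")
MORPHALOU_FILENAME_TEMPLATE = "{cat_name}_Morphalou3.1_TEI.xml"

# e.g. data/raw/Morphalou3.1_formatTEI/nom_Morphalou3.1_TEI.xml
cat_path = MORPHALOU_DIR_PATH / MORPHALOU_FILENAME_TEMPLATE.format(cat_name="nom")
print(cat_path)
```
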

@@ -32,10 +36,44 @@ class MorphalouSet:
     def __init__(self):
         self.word_db = WordDb()
 
+    @classmethod
+    def _ensure_uncompressed(cls):
+        """Ensures the dataset is uncompressed"""
+        if cls.MORPHALOU_DIR_PATH.exists():
+            return
+
+        lexique_archive = cls.MORPHALOU_DIR_PATH.with_suffix(".tar.xz")
+        if not lexique_archive.exists():
+            logger.error("Missing compressed dataset at %s", lexique_archive)
+            raise Exception(f"Missing compressed dataset at {lexique_archive}")
+
+        logger.info("Uncompressing dataset")
+        subprocess.check_call(
+            [
+                "tar",
+                "-xJf",
+                lexique_archive.as_posix(),
+                "-C",
+                lexique_archive.parent.as_posix(),
+            ]
+        )
+
+        if not cls.MORPHALOU_DIR_PATH.exists():
+            logger.error(
+                "Uncompressed dataset still missing at %s after extraction",
+                cls.MORPHALOU_DIR_PATH,
+            )
+            raise Exception(
+                f"Uncompressed dataset still missing at {cls.MORPHALOU_DIR_PATH} after extraction"
+            )
+
     def parse(self):
         """Parses the dataset"""
+        self.__class__._ensure_uncompressed()
+
         for cat, cat_file in self.__class__.CAT_MAPPING.items():
             word_db_elt = WordDb.CATEGORY_TO_ATTR[cat]
+            logging.info("Parsing %s...", word_db_elt)
             setattr(
                 self.word_db,
                 word_db_elt,
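
One caveat with the extraction helper above: Path.with_suffix replaces everything after the last dot, so on a directory named Morphalou3.1_formatTEI it yields Morphalou3.tar.xz rather than Morphalou3.1_formatTEI.tar.xz. A hedged alternative sketch that appends the suffix explicitly and uses the standard library's tarfile instead of shelling out to tar (the path below is illustrative):

```python
import tarfile
from pathlib import Path

dataset_dir = Path("data/raw/Morphalou3.1_formatTEI")  # illustrative location
archive = dataset_dir.parent / (dataset_dir.name + ".tar.xz")

if not dataset_dir.exists():
    # "r:xz" opens the LZMA-compressed tarball, mirroring `tar -xJf`.
    with tarfile.open(archive, "r:xz") as tar:
        tar.extractall(path=dataset_dir.parent)
```
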

pwgen_fr/morphalou_frequency.py (new file, 74 lines)

@@ -0,0 +1,74 @@
+""" Generates a worddb based on Morphalou, but limits to frequent words based on
+external sources (eg Lexique) """
+
+import logging
+import typing as t
+
+from . import lexique, morphalou
+from .word_db import Adjectif, Adverbe, Genre, Nom, Nombre, Temps, Verbe, WordDb
+
+logger = logging.getLogger(__name__)
+
+
+class MorphalouFreqSet:
+    morphalou_db: WordDb
+    lexique: lexique.Lexique
+    filtered_db: WordDb
+
+    def __init__(
+        self,
+        morphalou_db: t.Optional[WordDb] = None,
+        lexique: t.Optional[lexique.Lexique] = None,
+    ):
+        if not morphalou_db:
+            morphalou_set = morphalou.MorphalouSet()
+            morphalou_set.parse()
+            self.morphalou_db = morphalou_set.word_db
+        else:
+            self.morphalou_db = morphalou_db
+
+        if not lexique:
+            self.lexique = lexique.Lexique.parse()
+        else:
+            self.lexique = lexique
+
+        self.filtered_db = self._filter_lexique()
+
+    def _filter_nom(self, nom: Nom) -> bool:
+        freq = max(
+            self.lexique.lemfreq.get(nom.sing, 0.0),
+            self.lexique.lemfreq.get(nom.plur, 0.0),
+        )
+        return freq > 0
+
+    def _filter_adjectif(self, adjectif: Adjectif) -> bool:
+        freq = max(
+            self.lexique.lemfreq.get(adjectif.masc_sing, 0.0),
+            self.lexique.lemfreq.get(adjectif.fem_sing, 0.0),
+        )
+        return freq > 0
+
+    def _filter_verbe(self, verbe: Verbe) -> bool:
+        freq = max(
+            self.lexique.lemfreq.get(verbe.present_sing, 0.0),
+            self.lexique.lemfreq.get(verbe.futur_sing, 0.0),
+            self.lexique.lemfreq.get(verbe.imparfait_sing, 0.0),
+            self.lexique.lemfreq.get(verbe.present_plur, 0.0),
+            self.lexique.lemfreq.get(verbe.futur_plur, 0.0),
+            self.lexique.lemfreq.get(verbe.imparfait_plur, 0.0),
+        )
+        return freq > 0
+
+    def _filter_adverbe(self, adverbe: Adverbe) -> bool:
+        if " " in adverbe.adv:
+            return False
+        freq = self.lexique.lemfreq.get(adverbe.adv, 0.0)
+        return freq > 0
+
+    def _filter_lexique(self) -> WordDb:
+        out = WordDb()
+        out.noms = list(filter(self._filter_nom, self.morphalou_db.noms))
+        out.adjectifs = list(filter(self._filter_adjectif, self.morphalou_db.adjectifs))
+        out.verbes = list(filter(self._filter_verbe, self.morphalou_db.verbes))
+        out.adverbes = list(filter(self._filter_adverbe, self.morphalou_db.adverbes))
+        return out
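
A minimal usage sketch for the new module, assuming the package imports as pwgen_fr. It passes a pre-parsed Lexique explicitly, which also sidesteps the fact that inside __init__ the lexique parameter shadows the lexique module:

```python
from pwgen_fr import lexique
from pwgen_fr.morphalou_frequency import MorphalouFreqSet

# Parse Lexique once and hand it in; the constructor parses Morphalou itself,
# then keeps only entries whose surface forms have a non-zero Lexique frequency.
lex = lexique.Lexique.parse()
freq_set = MorphalouFreqSet(lexique=lex)

db = freq_set.filtered_db
print(len(db.noms), len(db.adjectifs), len(db.verbes), len(db.adverbes))
```
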

@@ -16,7 +16,7 @@ class Genre(Enum):
     @classmethod
     def pick(cls) -> "Genre":
         """random-pick (avoids inv)"""
-        return secrets.choice([cls.masc, cls.fem])
+        return secrets.choice([cls.MASC, cls.FEM])
 
 
 class Nombre(Enum):
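
The corrected member references imply the Genre members are upper-cased. A minimal sketch of the pattern, with the INV member and the string values assumed for illustration:

```python
import secrets
from enum import Enum

class Genre(Enum):
    MASC = "masc"
    FEM = "fem"
    INV = "inv"  # invariant forms are deliberately never picked

    @classmethod
    def pick(cls) -> "Genre":
        """random-pick (avoids inv)"""
        # secrets.choice draws from the OS CSPRNG, suitable for passphrase generation.
        return secrets.choice([cls.MASC, cls.FEM])

print(Genre.pick())
```
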

@@ -141,7 +141,7 @@ class Adverbe(t.NamedTuple):
 class WordDb:
     """Base de donnée de mots, sérialisable"""
 
-    SERIALIZED_GZ_LOCATION = Path(__file__).parent.parent / "morphalou_full.json.gz"
+    SERIALIZED_GZ_LOCATION = Path(__file__).parent.parent / "morphalou.json.gz"
 
     _serialize_data: dict[str, t.Type[t.NamedTuple]] = {
         "noms": Nom,
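
WordDb's own (de)serialization methods are not part of this diff; purely to illustrate the .json.gz format implied by the renamed SERIALIZED_GZ_LOCATION, a dict can be written and read back with the standard library alone (the payload and path here are illustrative):

```python
import gzip
import json
from pathlib import Path

# Illustrative payload; the real layout is driven by WordDb._serialize_data.
payload = {"noms": [], "adjectifs": [], "verbes": [], "adverbes": []}

location = Path("morphalou.json.gz")
with gzip.open(location, "wt", encoding="utf-8") as handle:
    json.dump(payload, handle)

with gzip.open(location, "rt", encoding="utf-8") as handle:
    restored = json.load(handle)
```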