Parse morphalou as word_db

This commit is contained in:
Théophile Bastian 2024-09-14 01:06:51 +02:00
parent f3df51ae26
commit b086b9a08d
2 changed files with 226 additions and 0 deletions

218
pwgen_fr/morphalou.py Normal file
View file

@ -0,0 +1,218 @@
""" Reads the Morphalou dataset, in its TEI (XML) form """
import typing as t
from lxml import etree
from pathlib import Path
import itertools
from .word_db import Adjectif, Adverbe, Genre, Nom, Nombre, Temps, Verbe, WordDb
# Prefix -> namespace URI map passed to the find()/iterfind() calls below.
# NOTE: despite its name, the "tsv" prefix is bound to the TEI namespace.
TSV_NS = dict(
    tsv="http://www.tei-c.org/ns/1.0",
    xml="http://www.w3.org/XML/1998/namespace",
)
class MorphalouSet:
    """Loads the Morphalou 3.1 lexicon (TEI XML files) into a `WordDb`.

    Call `parse()` to fill `word_db`: one XML file per word category is read
    from `MORPHALOU_DIR_PATH` and converted to the matching `word_db` tuples.
    """

    # Directory holding the per-category files, resolved two levels above this
    # module file.
    MORPHALOU_DIR_PATH = (
        Path(__file__).parent.parent
        / "data/raw/morphalou/morphalou/5/Morphalou3.1_formatTEI"
    )
    # File name of one category; `cat_name` is a value of `CAT_MAPPING`.
    MORPHALOU_FILENAME_TEMPLATE = "{cat_name}_Morphalou3.1_TEI.xml"

    # word_db tuple type -> category name used in the Morphalou file names.
    CAT_MAPPING: dict[t.Type[t.NamedTuple], str] = {
        Nom: "commonNoun",
        Adjectif: "adjective",
        Verbe: "verb",
        Adverbe: "adverb",
    }

    # Database filled by `parse()`.
    word_db: WordDb

    def __init__(self):
        self.word_db = WordDb()

    def parse(self):
        """Parses the dataset, storing each category's word list in `self.word_db`.

        For every category of `CAT_MAPPING`, the `_parse_<attr>` method named
        after the `WordDb` attribute is called on the category's file, and its
        result is assigned to that attribute.
        """
        cls = self.__class__
        for cat, cat_file in cls.CAT_MAPPING.items():
            word_db_elt = WordDb.CATEGORY_TO_ATTR[cat]
            parser = getattr(self, f"_parse_{word_db_elt}")
            tsv_path = cls.MORPHALOU_DIR_PATH / cls.MORPHALOU_FILENAME_TEMPLATE.format(
                cat_name=cat_file
            )
            setattr(self.word_db, word_db_elt, parser(tsv_path))

    def _tsv_elems(self, tsv_path: Path):
        """Opens a TEI XML file, and returns the <body> node, direct parent of all
        the relevant nodes.

        Returns None when the document has no `text/body` element.
        """
        # Binary mode: the XML parser picks the encoding from the document itself
        # instead of receiving text pre-decoded with the locale's default encoding.
        with tsv_path.open("rb") as h:
            tree = etree.parse(h)
        root = tree.getroot()
        return root.find("./tsv:text/tsv:body", TSV_NS)

    def _parse_noms(self, tsv_path: Path) -> list[Nom]:
        """Parse the nouns.

        Entries without a gender on their lemma, or lacking a singular or a
        plural form, are skipped.
        """
        root = self._tsv_elems(tsv_path)
        out: list[Nom] = []

        for entry in root.iterfind("./tsv:entry", TSV_NS):
            gen = entry.find("./tsv:form[@type='lemma']/tsv:gramGrp/tsv:gen", TSV_NS)
            if gen is None:
                continue  # some nouns don't have a gender defined, somehow -- ignore
            genre = self._genre(gen.text)

            # Number -> spelling; an "invariable" form fills both numbers.
            forms: dict[Nombre, str] = {}
            for inflected in entry.iterfind("./tsv:form[@type='inflected']", TSV_NS):
                orth = inflected.find("./tsv:orth", TSV_NS).text
                nombres = self._nombre_set(
                    inflected.find("./tsv:gramGrp/tsv:number", TSV_NS).text
                )
                for nombre in nombres:
                    forms[nombre] = orth

            try:
                nom = Nom(
                    genre=genre,
                    sing=forms[Nombre.SING],
                    plur=forms[Nombre.PLUR],
                )
            except KeyError:
                continue  # cannot be inflected to all required forms: skip
            out.append(nom)

        return out

    def _parse_adjectifs(self, tsv_path: Path) -> list[Adjectif]:
        """Parse the adjectives.

        Entries lacking any of the four gender/number combinations are skipped.
        """
        root = self._tsv_elems(tsv_path)
        out: list[Adjectif] = []

        for entry in root.iterfind("./tsv:entry", TSV_NS):
            # (gender, number) -> spelling; "invariable" values expand to both
            # genders / both numbers.
            forms: dict[tuple[Genre, Nombre], str] = {}
            for inflected in entry.iterfind("./tsv:form[@type='inflected']", TSV_NS):
                orth = inflected.find("./tsv:orth", TSV_NS).text
                gram_grp = inflected.find("./tsv:gramGrp", TSV_NS)
                genres = self._genre_set(gram_grp.find("./tsv:gen", TSV_NS).text)
                nombres = self._nombre_set(gram_grp.find("./tsv:number", TSV_NS).text)
                for form in itertools.product(genres, nombres):
                    forms[form] = orth

            try:
                adjectif = Adjectif(
                    masc_sing=forms[Genre.MASC, Nombre.SING],
                    masc_plur=forms[Genre.MASC, Nombre.PLUR],
                    fem_sing=forms[Genre.FEM, Nombre.SING],
                    fem_plur=forms[Genre.FEM, Nombre.PLUR],
                )
            except KeyError:
                continue  # cannot be inflected to all required forms: skip
            out.append(adjectif)

        return out

    def _parse_verbes(self, tsv_path: Path) -> list[Verbe]:
        """Parse the verbs.

        Only third-person indicative forms in the present, imperfect and future
        tenses are kept; entries lacking any of these six forms are skipped.
        """
        root = self._tsv_elems(tsv_path)
        out: list[Verbe] = []

        for entry in root.iterfind("./tsv:entry", TSV_NS):
            # (tense, number) -> spelling
            forms: dict[tuple[Temps, Nombre], str] = {}
            for inflected in entry.iterfind("./tsv:form[@type='inflected']", TSV_NS):
                gram_grp = inflected.find("./tsv:gramGrp", TSV_NS)

                # Order of tests is important! If mood == 'participle', there is no
                # 'person' defined.
                if (
                    gram_grp.find("./tsv:mood", TSV_NS).text != "indicative"
                    or gram_grp.find("./tsv:per", TSV_NS).text != "thirdPerson"
                ):
                    continue  # irrelevant for us

                temps = self._tense(gram_grp.find("./tsv:tns", TSV_NS).text)
                if temps is None:
                    continue  # irrelevant for us

                nombres = self._nombre_set(gram_grp.find("./tsv:number", TSV_NS).text)
                orth = inflected.find("./tsv:orth", TSV_NS).text
                for nombre in nombres:
                    forms[(temps, nombre)] = orth

            try:
                verbe = Verbe(
                    present_sing=forms[Temps.PRESENT, Nombre.SING],
                    present_plur=forms[Temps.PRESENT, Nombre.PLUR],
                    futur_sing=forms[Temps.FUTUR, Nombre.SING],
                    futur_plur=forms[Temps.FUTUR, Nombre.PLUR],
                    imparfait_sing=forms[Temps.IMPARFAIT, Nombre.SING],
                    imparfait_plur=forms[Temps.IMPARFAIT, Nombre.PLUR],
                )
            except KeyError:
                continue  # cannot be inflected to all required forms: skip
            out.append(verbe)

        return out

    def _parse_adverbes(self, tsv_path: Path) -> list[Adverbe]:
        """Parse the adverbs, keeping the lemma spelling of every entry."""
        root = self._tsv_elems(tsv_path)
        out: list[Adverbe] = []

        for entry in root.iterfind("./tsv:entry", TSV_NS):
            # We're only interested in the lemma form
            orth = entry.find("./tsv:form[@type='lemma']/tsv:orth", TSV_NS)
            assert orth is not None
            out.append(Adverbe(adv=orth.text))

        return out

    @staticmethod
    def _genre_set(genre: str) -> list[Genre]:
        """Maps a Morphalou gender to the genders it covers ("invariable" covers
        both). Raises KeyError on any other value."""
        return {
            "masculine": [Genre.MASC],
            "feminine": [Genre.FEM],
            "invariable": [Genre.MASC, Genre.FEM],
        }[genre]

    @staticmethod
    def _genre(genre: str) -> Genre:
        """Maps a Morphalou gender to a single `Genre` ("invariable" gives
        `Genre.INV`). Raises KeyError on any other value."""
        return {
            "masculine": Genre.MASC,
            "feminine": Genre.FEM,
            "invariable": Genre.INV,
        }[genre]

    @staticmethod
    def _nombre(nombre: str) -> Nombre:
        """Maps a Morphalou number ("singular" or "plural") to a `Nombre`.
        Raises KeyError on any other value."""
        return {
            "singular": Nombre.SING,
            "plural": Nombre.PLUR,
        }[nombre]

    @staticmethod
    def _nombre_set(nombre: str) -> list[Nombre]:
        """Maps a Morphalou number to the numbers it covers ("invariable" covers
        both). Raises KeyError on any other value."""
        return {
            "singular": [Nombre.SING],
            "plural": [Nombre.PLUR],
            "invariable": [Nombre.SING, Nombre.PLUR],
        }[nombre]

    @staticmethod
    def _tense(tense: str) -> t.Optional[Temps]:
        """Maps a Morphalou tense to a `Temps`, or None for tenses we don't use."""
        return {
            "present": Temps.PRESENT,
            "imperfect": Temps.IMPARFAIT,
            "future": Temps.FUTUR,
        }.get(tense, None)

View file

@ -8,6 +8,7 @@ import json
class Genre(Enum): class Genre(Enum):
MASC = "masculin" MASC = "masculin"
FEM = "féminin" FEM = "féminin"
INV = "invariable" # pour les noms uniquement
class Nombre(Enum): class Nombre(Enum):
@ -89,6 +90,13 @@ class WordDb:
"adverbes": Adverbe, "adverbes": Adverbe,
} }
CATEGORY_TO_ATTR: dict = {
Nom: "noms",
Adjectif: "adjectifs",
Verbe: "verbes",
Adverbe: "adverbes",
}
noms: list[Nom] noms: list[Nom]
adjectifs: list[Adjectif] adjectifs: list[Adjectif]
verbes: list[Verbe] verbes: list[Verbe]