Parse morphalou as word_db
This commit is contained in:
parent
f3df51ae26
commit
b086b9a08d
2 changed files with 226 additions and 0 deletions
218
pwgen_fr/morphalou.py
Normal file
218
pwgen_fr/morphalou.py
Normal file
|
@ -0,0 +1,218 @@
|
|||
"""Reads the Morphalou dataset, in its TEI XML form."""
|
||||
|
||||
import typing as t
|
||||
from lxml import etree
|
||||
from pathlib import Path
|
||||
import itertools
|
||||
|
||||
from .word_db import Adjectif, Adverbe, Genre, Nom, Nombre, Temps, Verbe, WordDb
|
||||
|
||||
# Namespace prefixes used in every XPath query of this module.
# Note that the "tsv" prefix is bound to the TEI namespace URI.
TSV_NS = {
    # Elements of the dataset files (TEI namespace).
    "tsv": "http://www.tei-c.org/ns/1.0",
    # Standard XML namespace.
    "xml": "http://www.w3.org/XML/1998/namespace",
}
|
||||
|
||||
|
||||
class MorphalouSet:
    """Reads the Morphalou 3.1 lexicon and stores it in a ``WordDb``.

    Despite the ``tsv`` naming used by the helpers below, the input files are
    TEI XML documents, one per grammatical category, found under
    ``MORPHALOU_DIR_PATH``. Call :meth:`parse` to fill :attr:`word_db`.
    """

    # Directory holding the per-category files, resolved two levels above this
    # module's own file.
    MORPHALOU_DIR_PATH = (
        Path(__file__).parent.parent
        / "data/raw/morphalou/morphalou/5/Morphalou3.1_formatTEI"
    )
    # File name pattern; ``cat_name`` is one of the values of ``CAT_MAPPING``.
    MORPHALOU_FILENAME_TEMPLATE = "{cat_name}_Morphalou3.1_TEI.xml"

    # Word category class -> category name used in the dataset file names.
    CAT_MAPPING: dict[t.Type[t.NamedTuple], str] = {
        Nom: "commonNoun",
        Adjectif: "adjective",
        Verbe: "verb",
        Adverbe: "adverb",
    }

    # Database filled by parse().
    word_db: WordDb

    def __init__(self):
        """Create a reader backed by a fresh ``WordDb``."""
        self.word_db = WordDb()

    def parse(self):
        """Parse every category file and store the result on ``self.word_db``.

        For each category of ``CAT_MAPPING``, the ``_parse_<attr>`` method
        matching ``WordDb.CATEGORY_TO_ATTR`` is called on the category's file,
        and its return value is assigned to the attribute of the same name on
        ``self.word_db``.
        """
        cls = self.__class__
        for cat, cat_file in cls.CAT_MAPPING.items():
            attr_name = WordDb.CATEGORY_TO_ATTR[cat]
            parse_category = getattr(self, f"_parse_{attr_name}")
            dataset_path = (
                cls.MORPHALOU_DIR_PATH
                / cls.MORPHALOU_FILENAME_TEMPLATE.format(cat_name=cat_file)
            )
            setattr(self.word_db, attr_name, parse_category(dataset_path))

    def _tsv_elems(self, tsv_path: Path):
        """Open a dataset file and return its ``<body>`` node.

        The ``<body>`` node (looked up at ``text/body`` under the root) is the
        direct parent of all the relevant ``<entry>`` nodes.

        Args:
            tsv_path: path of the XML file to read.

        Returns:
            The ``<body>`` element, or ``None`` if the document has none.
        """
        # Binary mode on purpose: the XML parser decodes the bytes itself. A
        # text-mode handle would first decode the file with the locale's
        # encoding, which may not match the file's actual encoding.
        with tsv_path.open("rb") as handle:
            tree = etree.parse(handle)
        return tree.getroot().find("./tsv:text/tsv:body", TSV_NS)

    def _parse_noms(self, tsv_path: Path) -> list[Nom]:
        """Parse the nouns.

        Entries without a gender on their lemma, and entries lacking either a
        singular or a plural form, are skipped.

        Args:
            tsv_path: path of the nouns file.

        Returns:
            One ``Nom`` per usable entry.
        """
        body = self._tsv_elems(tsv_path)
        nouns: list[Nom] = []

        for entry in body.iterfind("./tsv:entry", TSV_NS):
            try:
                genre = self._genre(
                    entry.find(
                        "./tsv:form[@type='lemma']/tsv:gramGrp/tsv:gen", TSV_NS
                    ).text
                )
            except AttributeError:
                # Some nouns don't have a gender defined, somehow -- ignore.
                continue

            # Maps each Nombre to the spelling of the matching inflected form.
            forms = {}
            for inflected in entry.iterfind("./tsv:form[@type='inflected']", TSV_NS):
                orth = inflected.find("./tsv:orth", TSV_NS).text
                number_text = inflected.find("./tsv:gramGrp/tsv:number", TSV_NS).text
                for nombre in self._nombre_set(number_text):
                    forms[nombre] = orth

            try:
                nouns.append(
                    Nom(
                        genre=genre,
                        sing=forms[Nombre.SING],
                        plur=forms[Nombre.PLUR],
                    )
                )
            except KeyError:
                # Cannot be inflected to all required forms: skip.
                continue

        return nouns

    def _parse_adjectifs(self, tsv_path: Path) -> list[Adjectif]:
        """Parse the adjectives.

        Entries that do not provide all four gender/number combinations are
        skipped.

        Args:
            tsv_path: path of the adjectives file.

        Returns:
            One ``Adjectif`` per usable entry.
        """
        body = self._tsv_elems(tsv_path)
        adjectives: list[Adjectif] = []

        for entry in body.iterfind("./tsv:entry", TSV_NS):
            # Maps each (Genre, Nombre) pair to the matching spelling.
            forms = {}
            for inflected in entry.iterfind("./tsv:form[@type='inflected']", TSV_NS):
                orth = inflected.find("./tsv:orth", TSV_NS).text
                gram_grp = inflected.find("./tsv:gramGrp", TSV_NS)
                genres = self._genre_set(gram_grp.find("./tsv:gen", TSV_NS).text)
                nombres = self._nombre_set(gram_grp.find("./tsv:number", TSV_NS).text)

                # An invariable form fills several (genre, nombre) slots at once.
                for key in itertools.product(genres, nombres):
                    forms[key] = orth

            try:
                adjectives.append(
                    Adjectif(
                        masc_sing=forms[Genre.MASC, Nombre.SING],
                        masc_plur=forms[Genre.MASC, Nombre.PLUR],
                        fem_sing=forms[Genre.FEM, Nombre.SING],
                        fem_plur=forms[Genre.FEM, Nombre.PLUR],
                    )
                )
            except KeyError:
                # Cannot be inflected to all required forms: skip.
                continue

        return adjectives

    def _parse_verbes(self, tsv_path: Path) -> list[Verbe]:
        """Parse the verbs.

        Only third-person indicative forms in the present, future and
        imperfect tenses are kept; entries lacking any of the six resulting
        tense/number combinations are skipped.

        Args:
            tsv_path: path of the verbs file.

        Returns:
            One ``Verbe`` per usable entry.
        """
        body = self._tsv_elems(tsv_path)
        verbs: list[Verbe] = []

        for entry in body.iterfind("./tsv:entry", TSV_NS):
            # Maps each (Temps, Nombre) pair to the matching spelling.
            forms = {}
            for inflected in entry.iterfind("./tsv:form[@type='inflected']", TSV_NS):
                gram_grp = inflected.find("./tsv:gramGrp", TSV_NS)

                # Order of tests is important! If mood == 'participle', there
                # is no 'person' defined, so the mood test must short-circuit
                # the person test.
                if (
                    gram_grp.find("./tsv:mood", TSV_NS).text != "indicative"
                    or gram_grp.find("./tsv:per", TSV_NS).text != "thirdPerson"
                ):
                    continue  # irrelevant for us

                temps = self._tense(gram_grp.find("./tsv:tns", TSV_NS).text)
                if temps is None:
                    continue  # irrelevant for us

                nombres = self._nombre_set(gram_grp.find("./tsv:number", TSV_NS).text)
                orth = inflected.find("./tsv:orth", TSV_NS).text
                for nombre in nombres:
                    forms[(temps, nombre)] = orth

            try:
                verbs.append(
                    Verbe(
                        present_sing=forms[Temps.PRESENT, Nombre.SING],
                        present_plur=forms[Temps.PRESENT, Nombre.PLUR],
                        futur_sing=forms[Temps.FUTUR, Nombre.SING],
                        futur_plur=forms[Temps.FUTUR, Nombre.PLUR],
                        imparfait_sing=forms[Temps.IMPARFAIT, Nombre.SING],
                        imparfait_plur=forms[Temps.IMPARFAIT, Nombre.PLUR],
                    )
                )
            except KeyError:
                # Cannot be inflected to all required forms: skip.
                continue

        return verbs

    def _parse_adverbes(self, tsv_path: Path) -> list[Adverbe]:
        """Parse the adverbs.

        Args:
            tsv_path: path of the adverbs file.

        Returns:
            One ``Adverbe`` per entry, built from the entry's lemma spelling.
        """
        body = self._tsv_elems(tsv_path)
        adverbs: list[Adverbe] = []

        for entry in body.iterfind("./tsv:entry", TSV_NS):
            # We're only interested in the lemma form.
            orth = entry.find("./tsv:form[@type='lemma']/tsv:orth", TSV_NS)
            assert orth is not None
            adverbs.append(Adverbe(adv=orth.text))

        return adverbs

    @staticmethod
    def _genre_set(genre: str) -> list[Genre]:
        """Return the genders covered by a dataset gender label.

        ``"invariable"`` expands to both masculine and feminine.

        Raises:
            KeyError: if ``genre`` is not a known label.
        """
        covered = {
            "masculine": [Genre.MASC],
            "feminine": [Genre.FEM],
            "invariable": [Genre.MASC, Genre.FEM],
        }
        return covered[genre]

    @staticmethod
    def _genre(genre: str) -> Genre:
        """Convert a dataset gender label to a single ``Genre``.

        Unlike :meth:`_genre_set`, ``"invariable"`` maps to ``Genre.INV``.

        Raises:
            KeyError: if ``genre`` is not a known label.
        """
        by_label = {
            "masculine": Genre.MASC,
            "feminine": Genre.FEM,
            "invariable": Genre.INV,
        }
        return by_label[genre]

    @staticmethod
    def _nombre(nombre: str) -> Nombre:
        """Convert a dataset number label to a single ``Nombre``.

        Raises:
            KeyError: if ``nombre`` is neither ``"singular"`` nor ``"plural"``.
        """
        by_label = {
            "singular": Nombre.SING,
            "plural": Nombre.PLUR,
        }
        return by_label[nombre]

    @staticmethod
    def _nombre_set(nombre: str) -> list[Nombre]:
        """Return the numbers covered by a dataset number label.

        ``"invariable"`` expands to both singular and plural.

        Raises:
            KeyError: if ``nombre`` is not a known label.
        """
        covered = {
            "singular": [Nombre.SING],
            "plural": [Nombre.PLUR],
            "invariable": [Nombre.SING, Nombre.PLUR],
        }
        return covered[nombre]

    @staticmethod
    def _tense(tense: str) -> t.Optional[Temps]:
        """Convert a dataset tense label to a ``Temps``.

        Returns:
            The matching ``Temps``, or ``None`` for any other tense label.
        """
        by_label = {
            "present": Temps.PRESENT,
            "imperfect": Temps.IMPARFAIT,
            "future": Temps.FUTUR,
        }
        return by_label.get(tense)
|
|
@ -8,6 +8,7 @@ import json
|
|||
class Genre(Enum):
    """Grammatical gender of a word."""

    MASC = "masculin"
    FEM = "féminin"
    # For nouns only.
    INV = "invariable"
|
||||
|
||||
|
||||
class Nombre(Enum):
|
||||
|
@ -89,6 +90,13 @@ class WordDb:
|
|||
"adverbes": Adverbe,
|
||||
}
|
||||
|
||||
CATEGORY_TO_ATTR: dict = {
|
||||
Nom: "noms",
|
||||
Adjectif: "adjectifs",
|
||||
Verbe: "verbes",
|
||||
Adverbe: "adverbes",
|
||||
}
|
||||
|
||||
noms: list[Nom]
|
||||
adjectifs: list[Adjectif]
|
||||
verbes: list[Verbe]
|
||||
|
|
Loading…
Reference in a new issue