256 lines
8.6 KiB
Python
256 lines
8.6 KiB
Python
""" Reads the Morphalou dataset, in its TSV form """
|
|
|
|
import itertools
|
|
import logging
|
|
import subprocess
|
|
import typing as t
|
|
from pathlib import Path
|
|
|
|
from lxml import etree
|
|
|
|
from .word_db import Adjectif, Adverbe, Genre, Nom, Nombre, Temps, Verbe, WordDb
|
|
|
|
# Prefix -> namespace URI map handed to every lxml `find`/`iterfind` call in
# this module. Note that the `tsv` prefix is bound to the TEI namespace.
TSV_NS = {
    "tsv": "http://www.tei-c.org/ns/1.0",
    "xml": "http://www.w3.org/XML/1998/namespace",
}

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class MorphalouSet:
    """Loads the Morphalou 3.1 lexicon files into a :class:`WordDb`.

    One XML file per grammatical category (see ``CAT_MAPPING``) is read from
    ``MORPHALOU_DIR_PATH``; the parsed words are stored on ``self.word_db``
    by :meth:`parse`.
    """

    # Directory holding the uncompressed dataset, relative to this package.
    MORPHALOU_DIR_PATH = (
        Path(__file__).parent.parent / "data/raw/Morphalou3.1_formatTEI"
    )
    # Per-category file name; `cat_name` is a value of CAT_MAPPING.
    MORPHALOU_FILENAME_TEMPLATE = "{cat_name}_Morphalou3.1_TEI.xml"

    # Word type -> category name used in the dataset's file names.
    CAT_MAPPING: dict[t.Type[t.NamedTuple], str] = {
        Nom: "commonNoun",
        Adjectif: "adjective",
        Verbe: "verb",
        Adverbe: "adverb",
    }

    word_db: WordDb

    def __init__(self):
        """Create a set backed by a fresh, empty ``WordDb``."""
        self.word_db = WordDb()

    @classmethod
    def _ensure_uncompressed(cls):
        """Ensure the dataset directory exists, extracting the archive if needed.

        Does nothing when ``MORPHALOU_DIR_PATH`` already exists. Otherwise
        runs ``tar -xJf`` on the archive, extracting into the archive's
        parent directory.

        Raises:
            FileNotFoundError: the archive is missing, or the dataset
                directory is still missing after extraction.
            subprocess.CalledProcessError: ``tar`` exited with an error.
        """
        if cls.MORPHALOU_DIR_PATH.exists():
            return

        # NOTE(review): `with_suffix` replaces everything after the LAST dot
        # of the directory name, so for "Morphalou3.1_formatTEI" this resolves
        # to "Morphalou3.tar.xz" -- confirm this matches the shipped archive.
        archive = cls.MORPHALOU_DIR_PATH.with_suffix(".tar.xz")
        if not archive.exists():
            logger.error("Missing compressed dataset at %s", archive)
            raise FileNotFoundError(f"Missing compressed dataset at {archive}")

        logger.info("Uncompressing dataset")
        subprocess.check_call(
            [
                "tar",
                "-xJf",
                archive.as_posix(),
                "-C",
                archive.parent.as_posix(),
            ]
        )

        if not cls.MORPHALOU_DIR_PATH.exists():
            logger.error(
                "Uncompressed dataset still missing at %s after extraction",
                cls.MORPHALOU_DIR_PATH,
            )
            raise FileNotFoundError(
                f"Uncompressed dataset still missing at {cls.MORPHALOU_DIR_PATH} after extraction"
            )

    def parse(self):
        """Parse every category of the dataset into ``self.word_db``.

        For each entry of ``CAT_MAPPING``, the target attribute name is looked
        up in ``WordDb.CATEGORY_TO_ATTR`` and the matching ``_parse_<attr>``
        method of this class is called on the category's file; its result is
        assigned to that attribute of ``self.word_db``.
        """
        cls = type(self)
        cls._ensure_uncompressed()

        for cat, cat_file in cls.CAT_MAPPING.items():
            word_db_elt = WordDb.CATEGORY_TO_ATTR[cat]
            # Use the module logger (not the root logger) like the rest of
            # this module, so the message carries the module's name.
            logger.info("Parsing %s...", word_db_elt)

            parse_category = getattr(self, f"_parse_{word_db_elt}")
            tsv_path = cls.MORPHALOU_DIR_PATH / cls.MORPHALOU_FILENAME_TEMPLATE.format(
                cat_name=cat_file
            )
            setattr(self.word_db, word_db_elt, parse_category(tsv_path))

    def _tsv_elems(self, tsv_path: Path):
        """Parse the XML file at `tsv_path` and return its ``text/body`` node.

        The returned element is the direct parent of all the ``entry`` nodes
        the ``_parse_*`` methods iterate over (``None`` if the document has
        no such node).
        """
        # Binary mode: hand raw bytes to the XML parser so that it applies the
        # document's own encoding instead of the locale's default text codec.
        with tsv_path.open("rb") as h:
            tree = etree.parse(h)
        return tree.getroot().find("./tsv:text/tsv:body", TSV_NS)

    def _parse_noms(self, tsv_path: Path) -> list[Nom]:
        """Parse the nouns file into a list of ``Nom``.

        Entries without a gender on their lemma form, or lacking either a
        singular or a plural inflected form, are skipped.
        """
        root = self._tsv_elems(tsv_path)
        out: list[Nom] = []

        for entry in root.iterfind("./tsv:entry", TSV_NS):
            try:
                genre = self._genre(
                    entry.find(
                        "./tsv:form[@type='lemma']/tsv:gramGrp/tsv:gen", TSV_NS
                    ).text
                )
            except AttributeError:
                continue  # some nouns don't have a gender defined, somehow -- ignore

            # Number -> spelling; an "invariable" form fills both numbers.
            forms = {}
            for inflected in entry.iterfind("./tsv:form[@type='inflected']", TSV_NS):
                orth = inflected.find("./tsv:orth", TSV_NS).text
                nombres = self._nombre_set(
                    inflected.find("./tsv:gramGrp/tsv:number", TSV_NS).text
                )
                for form in nombres:
                    forms[form] = orth

            try:
                out.append(
                    Nom(
                        genre=genre,
                        sing=forms[Nombre.SING],
                        plur=forms[Nombre.PLUR],
                    )
                )
            except KeyError:
                continue  # cannot be inflected to all required forms: skip

        return out

    def _parse_adjectifs(self, tsv_path: Path) -> list[Adjectif]:
        """Parse the adjectives file into a list of ``Adjectif``.

        Entries that do not provide all four (gender, number) combinations
        are skipped.
        """
        root = self._tsv_elems(tsv_path)
        out: list[Adjectif] = []

        for entry in root.iterfind("./tsv:entry", TSV_NS):
            # (gender, number) -> spelling; "invariable" values fan out to
            # every gender and/or number they stand for.
            forms = {}
            for inflected in entry.iterfind("./tsv:form[@type='inflected']", TSV_NS):
                orth = inflected.find("./tsv:orth", TSV_NS).text
                gram_grp = inflected.find("./tsv:gramGrp", TSV_NS)
                genres = self._genre_set(gram_grp.find("./tsv:gen", TSV_NS).text)
                nombres = self._nombre_set(gram_grp.find("./tsv:number", TSV_NS).text)

                for form in itertools.product(genres, nombres):
                    forms[form] = orth

            try:
                out.append(
                    Adjectif(
                        masc_sing=forms[Genre.MASC, Nombre.SING],
                        masc_plur=forms[Genre.MASC, Nombre.PLUR],
                        fem_sing=forms[Genre.FEM, Nombre.SING],
                        fem_plur=forms[Genre.FEM, Nombre.PLUR],
                    )
                )
            except KeyError:
                continue  # cannot be inflected to all required forms: skip

        return out

    def _parse_verbes(self, tsv_path: Path) -> list[Verbe]:
        """Parse the verbs file into a list of ``Verbe``.

        Only third-person indicative forms in a tense known to ``_tense`` are
        kept. Entries missing any of the six (tense, number) combinations are
        skipped.
        """
        root = self._tsv_elems(tsv_path)
        out: list[Verbe] = []

        for entry in root.iterfind("./tsv:entry", TSV_NS):
            # (tense, number) -> spelling
            forms = {}
            for inflected in entry.iterfind("./tsv:form[@type='inflected']", TSV_NS):
                gram_grp = inflected.find("./tsv:gramGrp", TSV_NS)

                # Order of tests is important! If mood == 'participle', there is no
                # 'person' defined; short-circuiting on the mood avoids the lookup.
                if (
                    gram_grp.find("./tsv:mood", TSV_NS).text != "indicative"
                    or gram_grp.find("./tsv:per", TSV_NS).text != "thirdPerson"
                ):
                    continue  # irrelevant for us

                temps = self._tense(gram_grp.find("./tsv:tns", TSV_NS).text)
                if temps is None:
                    continue  # irrelevant for us

                nombres = self._nombre_set(gram_grp.find("./tsv:number", TSV_NS).text)

                orth = inflected.find("./tsv:orth", TSV_NS).text
                for nombre in nombres:
                    forms[(temps, nombre)] = orth

            try:
                out.append(
                    Verbe(
                        present_sing=forms[Temps.PRESENT, Nombre.SING],
                        present_plur=forms[Temps.PRESENT, Nombre.PLUR],
                        futur_sing=forms[Temps.FUTUR, Nombre.SING],
                        futur_plur=forms[Temps.FUTUR, Nombre.PLUR],
                        imparfait_sing=forms[Temps.IMPARFAIT, Nombre.SING],
                        imparfait_plur=forms[Temps.IMPARFAIT, Nombre.PLUR],
                    )
                )
            except KeyError:
                continue  # cannot be inflected to all required forms: skip

        return out

    def _parse_adverbes(self, tsv_path: Path) -> list[Adverbe]:
        """Parse the adverbs file into a list of ``Adverbe``.

        Only the spelling of each entry's lemma form is used.
        """
        root = self._tsv_elems(tsv_path)
        out: list[Adverbe] = []

        for entry in root.iterfind("./tsv:entry", TSV_NS):
            # We're only interested in the lemma form
            orth = entry.find("./tsv:form[@type='lemma']/tsv:orth", TSV_NS)
            assert orth is not None
            adv = orth.text
            out.append(Adverbe(adv=adv))

        return out

    @staticmethod
    def _genre_set(genre: str) -> list[Genre]:
        """Map a dataset gender label to the list of genders it covers.

        "invariable" expands to both masculine and feminine. Raises
        ``KeyError`` on any other label.
        """
        return {
            "masculine": [Genre.MASC],
            "feminine": [Genre.FEM],
            "invariable": [Genre.MASC, Genre.FEM],
        }[genre]

    @staticmethod
    def _genre(genre: str) -> Genre:
        """Map a dataset gender label to a single ``Genre``.

        Unlike ``_genre_set``, "invariable" maps to ``Genre.INV``. Raises
        ``KeyError`` on any other label.
        """
        return {
            "masculine": Genre.MASC,
            "feminine": Genre.FEM,
            "invariable": Genre.INV,
        }[genre]

    @staticmethod
    def _nombre(nombre: str) -> Nombre:
        """Map "singular"/"plural" to a ``Nombre``; ``KeyError`` otherwise."""
        return {
            "singular": Nombre.SING,
            "plural": Nombre.PLUR,
        }[nombre]

    @staticmethod
    def _nombre_set(nombre: str) -> list[Nombre]:
        """Map a dataset number label to the list of numbers it covers.

        "invariable" expands to both singular and plural. Raises ``KeyError``
        on any other label.
        """
        return {
            "singular": [Nombre.SING],
            "plural": [Nombre.PLUR],
            "invariable": [Nombre.SING, Nombre.PLUR],
        }[nombre]

    @staticmethod
    def _tense(tense: str) -> t.Optional[Temps]:
        """Map a dataset tense label to a ``Temps``, or ``None`` if unhandled."""
        return {
            "present": Temps.PRESENT,
            "imperfect": Temps.IMPARFAIT,
            "future": Temps.FUTUR,
        }.get(tense, None)
|