Morphalou: use new dataset location, uncompress

This commit is contained in:
Théophile Bastian 2024-09-19 21:06:08 +02:00
parent e8379656e1
commit a872ecb0f9

View file

@ -1,9 +1,12 @@
""" Reads the Morphalou dataset, in its TEI (XML) form """
import typing as t
from lxml import etree
from pathlib import Path
import itertools
import logging
import subprocess
import typing as t
from pathlib import Path
from lxml import etree
from .word_db import Adjectif, Adverbe, Genre, Nom, Nombre, Temps, Verbe, WordDb
@ -12,11 +15,12 @@ TSV_NS = {
"xml": "http://www.w3.org/XML/1998/namespace",
}
logger = logging.getLogger(__name__)
class MorphalouSet:
MORPHALOU_DIR_PATH = (
Path(__file__).parent.parent
/ "data/raw/morphalou/morphalou/5/Morphalou3.1_formatTEI"
Path(__file__).parent.parent / "data/raw/Morphalou3.1_formatTEI"
)
MORPHALOU_FILENAME_TEMPLATE = "{cat_name}_Morphalou3.1_TEI.xml"
@ -32,10 +36,44 @@ class MorphalouSet:
def __init__(self):
    """Attach a newly constructed ``WordDb`` to this set as ``word_db``."""
    self.word_db = WordDb()
@classmethod
def _ensure_uncompressed(cls):
    """Ensure the dataset directory exists, extracting its archive if needed.

    Does nothing when ``MORPHALOU_DIR_PATH`` already exists. Otherwise a
    sibling ``.tar.xz`` archive is located and extracted with ``tar`` into
    the archive's parent directory.

    Raises:
        FileNotFoundError: if no archive is found, or if the dataset
            directory is still missing after extraction.
        subprocess.CalledProcessError: if ``tar`` exits with a non-zero
            status.
    """
    dataset_dir = cls.MORPHALOU_DIR_PATH
    if dataset_dir.exists():
        return

    # The directory name contains a dot ("Morphalou3.1_formatTEI"), so
    # `with_suffix` treats ".1_formatTEI" as the suffix and would produce
    # "Morphalou3.tar.xz". Append the extension to the full name instead, and
    # keep the truncated name as a fallback for archives stored under it.
    candidates = (
        dataset_dir.with_name(dataset_dir.name + ".tar.xz"),
        dataset_dir.with_suffix(".tar.xz"),
    )
    archive = next((path for path in candidates if path.exists()), None)
    if archive is None:
        logger.error("Missing compressed dataset at %s", candidates[0])
        raise FileNotFoundError(
            f"Missing compressed dataset at {candidates[0]}"
        )

    logger.info("Uncompressing dataset")
    subprocess.check_call(
        [
            "tar",
            "-xJf",
            archive.as_posix(),
            "-C",
            archive.parent.as_posix(),
        ]
    )

    # The archive may not contain the expected top-level directory; fail
    # loudly here rather than later while opening the XML files.
    if not dataset_dir.exists():
        logger.error(
            "Uncompressed dataset still missing at %s after extraction",
            dataset_dir,
        )
        raise FileNotFoundError(
            f"Uncompressed dataset still missing at {dataset_dir} after extraction"
        )
def parse(self):
"""Parses the dataset"""
self.__class__._ensure_uncompressed()
for cat, cat_file in self.__class__.CAT_MAPPING.items():
word_db_elt = WordDb.CATEGORY_TO_ATTR[cat]
logging.info("Parsing %s...", word_db_elt)
setattr(
self.word_db,
word_db_elt,