From a872ecb0f97ec28a5fa32d83e69c4bef4691829f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20Bastian?=
Date: Thu, 19 Sep 2024 21:06:08 +0200
Subject: [PATCH] Morphalou: use new dataset location, uncompress

---
 pwgen_fr/morphalou.py | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 56 insertions(+), 5 deletions(-)

diff --git a/pwgen_fr/morphalou.py b/pwgen_fr/morphalou.py
index abaacec..0405f85 100644
--- a/pwgen_fr/morphalou.py
+++ b/pwgen_fr/morphalou.py
@@ -1,9 +1,12 @@
 """ Reads the Morphalou dataset, in its TSV form """
 
-import typing as t
-from lxml import etree
-from pathlib import Path
 import itertools
+import logging
+import subprocess
+import typing as t
+from pathlib import Path
+
+from lxml import etree
 
 from .word_db import Adjectif, Adverbe, Genre, Nom, Nombre, Temps, Verbe, WordDb
 
@@ -12,11 +15,12 @@ TSV_NS = {
     "xml": "http://www.w3.org/XML/1998/namespace",
 }
 
+logger = logging.getLogger(__name__)
+
 
 class MorphalouSet:
     MORPHALOU_DIR_PATH = (
-        Path(__file__).parent.parent
-        / "data/raw/morphalou/morphalou/5/Morphalou3.1_formatTEI"
+        Path(__file__).parent.parent / "data/raw/Morphalou3.1_formatTEI"
     )
     MORPHALOU_FILENAME_TEMPLATE = "{cat_name}_Morphalou3.1_TEI.xml"
 
@@ -32,10 +36,57 @@ class MorphalouSet:
     def __init__(self):
         self.word_db = WordDb()
 
+    @classmethod
+    def _ensure_uncompressed(cls):
+        """Ensures the dataset is uncompressed
+
+        Does nothing if `MORPHALOU_DIR_PATH` already exists. Otherwise,
+        extracts the sibling `.tar.xz` archive with `tar`.
+
+        Raises `FileNotFoundError` if the archive is missing, or if the
+        dataset directory is still missing after extraction, and
+        `subprocess.CalledProcessError` if `tar` fails.
+        """
+        if cls.MORPHALOU_DIR_PATH.exists():
+            return
+
+        # The directory name contains a dot ("Morphalou3.1_formatTEI"), so
+        # `with_suffix` would replace ".1_formatTEI" and look for
+        # "Morphalou3.tar.xz": append the extension to the full name instead.
+        lexique_archive = cls.MORPHALOU_DIR_PATH.with_name(
+            cls.MORPHALOU_DIR_PATH.name + ".tar.xz"
+        )
+        if not lexique_archive.exists():
+            logger.error("Missing compressed dataset at %s", lexique_archive)
+            raise FileNotFoundError(f"Missing compressed dataset at {lexique_archive}")
+
+        logger.info("Uncompressing dataset")
+        subprocess.check_call(
+            [
+                "tar",
+                "-xJf",
+                lexique_archive.as_posix(),
+                "-C",
+                lexique_archive.parent.as_posix(),
+            ]
+        )
+
+        if not cls.MORPHALOU_DIR_PATH.exists():
+            logger.error(
+                "Uncompressed dataset still missing at %s after extraction",
+                cls.MORPHALOU_DIR_PATH,
+            )
+            raise FileNotFoundError(
+                f"Uncompressed dataset still missing at {cls.MORPHALOU_DIR_PATH} after extraction"
+            )
+
     def parse(self):
         """Parses the dataset"""
+        self.__class__._ensure_uncompressed()
+
         for cat, cat_file in self.__class__.CAT_MAPPING.items():
             word_db_elt = WordDb.CATEGORY_TO_ATTR[cat]
+            logger.info("Parsing %s...", word_db_elt)
             setattr(
                 self.word_db,
                 word_db_elt,