Morphalou: use new dataset location, uncompress
This commit is contained in:
parent
e8379656e1
commit
a872ecb0f9
1 changed files with 43 additions and 5 deletions
|
@ -1,9 +1,12 @@
|
|||
""" Reads the Morphalou dataset, in its TSV form """
|
||||
|
||||
import typing as t
|
||||
from lxml import etree
|
||||
from pathlib import Path
|
||||
import itertools
|
||||
import logging
|
||||
import subprocess
|
||||
import typing as t
|
||||
from pathlib import Path
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from .word_db import Adjectif, Adverbe, Genre, Nom, Nombre, Temps, Verbe, WordDb
|
||||
|
||||
|
@ -12,11 +15,12 @@ TSV_NS = {
|
|||
"xml": "http://www.w3.org/XML/1998/namespace",
|
||||
}
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MorphalouSet:
|
||||
MORPHALOU_DIR_PATH = (
|
||||
Path(__file__).parent.parent
|
||||
/ "data/raw/morphalou/morphalou/5/Morphalou3.1_formatTEI"
|
||||
Path(__file__).parent.parent / "data/raw/Morphalou3.1_formatTEI"
|
||||
)
|
||||
MORPHALOU_FILENAME_TEMPLATE = "{cat_name}_Morphalou3.1_TEI.xml"
|
||||
|
||||
|
@ -32,10 +36,44 @@ class MorphalouSet:
|
|||
def __init__(self):
    """Set up the instance with a freshly constructed ``WordDb``."""
    self.word_db = WordDb()
|
||||
|
||||
@classmethod
def _ensure_uncompressed(cls):
    """Ensure the dataset directory exists, extracting its archive if needed.

    Does nothing when ``cls.MORPHALOU_DIR_PATH`` already exists. Otherwise a
    ``.tar.xz`` archive located next to that directory is extracted into the
    directory's parent using the system ``tar`` command.

    Raises:
        FileNotFoundError: if no archive is found, or if the dataset
            directory is still missing once extraction has finished.
        subprocess.CalledProcessError: if ``tar`` exits with a non-zero
            status.
    """
    dataset_dir = cls.MORPHALOU_DIR_PATH
    if dataset_dir.exists():
        return

    # `with_suffix` replaces everything after the last dot of the name, so a
    # dotted directory name such as "Morphalou3.1_formatTEI" maps to
    # "Morphalou3.tar.xz". That historical location is still tried first; the
    # archive named after the full directory name is accepted as a fallback.
    candidates = [
        dataset_dir.with_suffix(".tar.xz"),
        dataset_dir.with_name(dataset_dir.name + ".tar.xz"),
    ]
    archive_path = next((path for path in candidates if path.exists()), None)
    if archive_path is None:
        # dict.fromkeys de-duplicates while keeping order (both candidates
        # are identical when the directory name has no dot).
        tried = " or ".join(str(path) for path in dict.fromkeys(candidates))
        logger.error("Missing compressed dataset at %s", tried)
        raise FileNotFoundError(f"Missing compressed dataset at {tried}")

    logger.info("Uncompressing dataset")
    subprocess.check_call(
        [
            "tar",
            "-xJf",
            archive_path.as_posix(),
            "-C",
            archive_path.parent.as_posix(),
        ]
    )

    # The archive may not contain the expected top-level directory; fail
    # loudly here rather than later while opening the XML files.
    if not dataset_dir.exists():
        logger.error(
            "Uncompressed dataset still missing at %s after extraction",
            dataset_dir,
        )
        raise FileNotFoundError(
            f"Uncompressed dataset still missing at {dataset_dir} after extraction"
        )
|
||||
|
||||
def parse(self):
|
||||
"""Parses the dataset"""
|
||||
self.__class__._ensure_uncompressed()
|
||||
|
||||
for cat, cat_file in self.__class__.CAT_MAPPING.items():
|
||||
word_db_elt = WordDb.CATEGORY_TO_ATTR[cat]
|
||||
logging.info("Parsing %s...", word_db_elt)
|
||||
setattr(
|
||||
self.word_db,
|
||||
word_db_elt,
|
||||
|
|
Loading…
Reference in a new issue