Morphalou: use new dataset location, uncompress
This commit is contained in:
parent
e8379656e1
commit
a872ecb0f9
1 changed files with 43 additions and 5 deletions
|
@ -1,9 +1,12 @@
|
||||||
""" Reads the Morphalou dataset, in its TSV form """
|
""" Reads the Morphalou dataset, in its TSV form """
|
||||||
|
|
||||||
import typing as t
|
|
||||||
from lxml import etree
|
|
||||||
from pathlib import Path
|
|
||||||
import itertools
|
import itertools
|
||||||
|
import logging
|
||||||
|
import subprocess
|
||||||
|
import typing as t
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
from .word_db import Adjectif, Adverbe, Genre, Nom, Nombre, Temps, Verbe, WordDb
|
from .word_db import Adjectif, Adverbe, Genre, Nom, Nombre, Temps, Verbe, WordDb
|
||||||
|
|
||||||
|
@ -12,11 +15,12 @@ TSV_NS = {
|
||||||
"xml": "http://www.w3.org/XML/1998/namespace",
|
"xml": "http://www.w3.org/XML/1998/namespace",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class MorphalouSet:
|
class MorphalouSet:
|
||||||
MORPHALOU_DIR_PATH = (
|
MORPHALOU_DIR_PATH = (
|
||||||
Path(__file__).parent.parent
|
Path(__file__).parent.parent / "data/raw/Morphalou3.1_formatTEI"
|
||||||
/ "data/raw/morphalou/morphalou/5/Morphalou3.1_formatTEI"
|
|
||||||
)
|
)
|
||||||
MORPHALOU_FILENAME_TEMPLATE = "{cat_name}_Morphalou3.1_TEI.xml"
|
MORPHALOU_FILENAME_TEMPLATE = "{cat_name}_Morphalou3.1_TEI.xml"
|
||||||
|
|
||||||
|
@ -32,10 +36,44 @@ class MorphalouSet:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.word_db = WordDb()
|
self.word_db = WordDb()
|
||||||
|
|
||||||
|
@classmethod
def _ensure_uncompressed(cls):
    """Ensure the Morphalou dataset directory exists, extracting it if needed.

    Does nothing when ``cls.MORPHALOU_DIR_PATH`` already exists. Otherwise
    looks for a ``.tar.xz`` archive sitting next to that directory and
    extracts it into the archive's parent directory using the system
    ``tar`` command.

    Raises:
        FileNotFoundError: if no archive is found, or if the dataset
            directory is still missing once extraction has completed.
        subprocess.CalledProcessError: if ``tar`` exits with a non-zero
            status.
    """
    dataset_dir = cls.MORPHALOU_DIR_PATH
    if dataset_dir.exists():
        return

    # The directory name contains a dot ("Morphalou3.1_formatTEI"), so
    # `with_suffix(".tar.xz")` would replace ".1_formatTEI" and point at
    # "Morphalou3.tar.xz". Build the archive name by appending to the full
    # directory name instead; the `with_suffix` spelling is kept as a
    # fallback so an archive stored under that shorter name is still found.
    candidates = [
        dataset_dir.with_name(dataset_dir.name + ".tar.xz"),
        dataset_dir.with_suffix(".tar.xz"),
    ]
    archive = next((path for path in candidates if path.exists()), None)
    if archive is None:
        logger.error("Missing compressed dataset at %s", candidates[0])
        raise FileNotFoundError(
            f"Missing compressed dataset at {candidates[0]}"
        )

    logger.info("Uncompressing dataset")
    # Argument list (no shell): paths are passed verbatim to `tar`.
    subprocess.check_call(
        [
            "tar",
            "-xJf",
            archive.as_posix(),
            "-C",
            archive.parent.as_posix(),
        ]
    )

    # The archive is expected to contain the dataset directory itself;
    # fail loudly if extraction did not produce it.
    if not dataset_dir.exists():
        logger.error(
            "Uncompressed dataset still missing at %s after extraction",
            dataset_dir,
        )
        raise FileNotFoundError(
            f"Uncompressed dataset still missing at {dataset_dir} after extraction"
        )
|
||||||
|
|
||||||
def parse(self):
|
def parse(self):
|
||||||
"""Parses the dataset"""
|
"""Parses the dataset"""
|
||||||
|
self.__class__._ensure_uncompressed()
|
||||||
|
|
||||||
for cat, cat_file in self.__class__.CAT_MAPPING.items():
|
for cat, cat_file in self.__class__.CAT_MAPPING.items():
|
||||||
word_db_elt = WordDb.CATEGORY_TO_ATTR[cat]
|
word_db_elt = WordDb.CATEGORY_TO_ATTR[cat]
|
||||||
|
logging.info("Parsing %s...", word_db_elt)
|
||||||
setattr(
|
setattr(
|
||||||
self.word_db,
|
self.word_db,
|
||||||
word_db_elt,
|
word_db_elt,
|
||||||
|
|
Loading…
Add table
Reference in a new issue