pwgen-fr/pwgen_fr/lexique.py

import csv
import itertools
from dataclasses import dataclass, field
import logging
import subprocess
import typing as t
from bisect import bisect_left
import enum
from pathlib import Path
from .word_db import Genre, Nombre, Temps, Nom, Adjectif, Verbe, Adverbe, WordDb

logger = logging.getLogger(__name__)


class CatGram(enum.Enum):
    """Grammatical category codes used in the Lexique383 'cgram' column."""

    NOM = "NOM"
    VERBE = "VER"
    ADJECTIF = "ADJ"
    ADVERBE = "ADV"
    AUXILIAIRE = "AUX"
    ARTICLE = "ART"
    CONJONCTION = "CON"
    LIAISON = "LIA"
    PREPOSITION = "PRE"
    PRONOM = "PRO"
    ONOMATOPEE = "ONO"

    @classmethod
    def parse(cls, val: str) -> "CatGram":
        """Parses a 'cgram' entry, keeping only the base category."""
        base = val.split(":", maxsplit=1)[0]
        return cls(base)

    def __lt__(self, oth):
        return self.value < oth.value
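

# Illustrative examples of CatGram.parse (the 'cgram' values shown here are
# assumed, not taken from the dataset itself):
#   CatGram.parse("NOM") == CatGram.NOM
#   CatGram.parse("PRO:per") == CatGram.PRONOM   # sub-category after ':' is dropped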


def match_enum_or_all(val: str, enum_mapper, enum_cls) -> list:
    """The enum value matching `val` if any; otherwise, all the values of the enum."""
    if val in enum_mapper:
        return [enum_mapper[val]]
    return list(enum_cls)
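

# Example (illustrative, assuming Genre has exactly MASC and FEM):
#   match_enum_or_all("f", Lexique.Parsers.genre, Genre) -> [Genre.FEM]
#   match_enum_or_all("", Lexique.Parsers.genre, Genre)  -> [Genre.MASC, Genre.FEM]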


@dataclass
class Mot:
    """A dataset entry: canonical form (lemme) plus its inflected variants."""

    mot: str
    lemme: str
    cat_gram: CatGram
    freq: float  # occurrences of the canonical form per million words
    variantes: dict[tuple, str] = field(default_factory=dict)
    genre: t.Optional[Genre] = None
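

# Illustrative example of a populated entry (all values assumed, not taken from
# the dataset):
#   Mot(mot="chat", lemme="chat", cat_gram=CatGram.NOM, freq=42.0, genre=Genre.MASC,
#       variantes={(Nombre.SING,): "chat", (Nombre.PLUR,): "chats"})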


class Lexique:
    """Accessor for the Lexique383 dataset, with conversion to a WordDb."""

    LEXIQUE_DIR_PATH = Path(__file__).parent.parent / "data/raw/Lexique383"
    LEXIQUE_PATH = LEXIQUE_DIR_PATH / "Lexique383.tsv"

    PRESET_THRESHOLD_BY_CAT: dict[CatGram, int] = {
        CatGram.NOM: 10000,
        CatGram.VERBE: 10000,
        CatGram.ADJECTIF: 10000,
        CatGram.ADVERBE: 10000,
    }

    class Parsers:
        """Datatables to help parse the original data"""

        genre: dict[str, Genre] = {
            "m": Genre.MASC,
            "f": Genre.FEM,
        }
        rev_genre: dict[t.Optional[Genre], str] = {
            None: "",
            Genre.MASC: "m",
            Genre.FEM: "f",
        }
        nombre: dict[str, Nombre] = {
            "s": Nombre.SING,
            "p": Nombre.PLUR,
        }
        verbe_temps: dict[str, Temps] = {
            "ind:pre": Temps.PRESENT,
            "ind:fut": Temps.FUTUR,
            "ind:imp": Temps.IMPARFAIT,
        }
        verbe_personne: dict[str, Nombre] = {
            "3s": Nombre.SING,
            "3p": Nombre.PLUR,
        }

    dataset: list[Mot]

    def __init__(self, dataset):
        self.dataset = dataset

    @classmethod
    def _ensure_uncompressed(cls):
        """Ensures the dataset is uncompressed"""
        if cls.LEXIQUE_DIR_PATH.exists():
            return
        lexique_archive = cls.LEXIQUE_DIR_PATH.with_suffix(".tar.xz")
        if not lexique_archive.exists():
            logger.error("Missing compressed dataset at %s", lexique_archive)
            raise Exception(f"Missing compressed dataset at {lexique_archive}")
        logger.info("Uncompressing dataset")
        subprocess.check_call(
            [
                "tar",
                "-xJf",
                lexique_archive.as_posix(),
                "-C",
                lexique_archive.parent.as_posix(),
            ]
        )
        if not cls.LEXIQUE_DIR_PATH.exists():
            logger.error(
                "Uncompressed dataset still missing at %s after extraction",
                cls.LEXIQUE_DIR_PATH,
            )
            raise Exception(
                f"Uncompressed dataset still missing at {cls.LEXIQUE_DIR_PATH} after extraction"
            )

    @classmethod
    def _find_word_key(cls, mot: Mot):
        """Sort/bisect key for the dataset: (lemme, cat_gram, genre-as-string)."""
        return (mot.lemme, mot.cat_gram, cls.Parsers.rev_genre[mot.genre])

    @classmethod
    def _find_word(cls, dataset: list[Mot], row: dict) -> t.Optional[Mot]:
        """Finds the canonical entry matching a dataset row, if any."""
        str_lemme = row["lemme"]
        cat_gram = CatGram.parse(row["cgram"])
        genre = row["genre"] if cat_gram == CatGram.NOM else ""
        row_key = (
            str_lemme,
            cat_gram,
            genre,
        )
        lemme_pos = bisect_left(
            dataset,
            row_key,
            key=cls._find_word_key,
        )
        if lemme_pos >= len(dataset):
            return None
        out = dataset[lemme_pos]
        if row_key != cls._find_word_key(out):
            return None
        return out
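
    # Lookup sketch (illustrative values): the dataset is sorted by
    # _find_word_key, so a row for "chats" (lemme "chat", cgram "NOM", genre "m")
    # is matched by bisecting for the key ("chat", CatGram.NOM, "m") and checking
    # that the entry found at that position carries exactly the same key.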

    @classmethod
    def parse(cls) -> "Lexique":
        cls._ensure_uncompressed()  # make sure the raw dataset has been extracted
        out = []
        rows = []
        with cls.LEXIQUE_PATH.open("r") as h:
            reader = csv.DictReader(h, dialect="excel-tab")
            for row in reader:
                if not row["cgram"]:
                    continue
                rows.append(row)

        # First pass: generate canonical forms (lemmes)
        for row in rows:
            cat_gram = CatGram.parse(row["cgram"])
            if (row["lemme"] != row["ortho"]) and not (
                cat_gram == CatGram.NOM and row["genre"] == "f" and row["nombre"] == "s"
            ):
                # A feminine singular noun is considered a canonical form
                continue
            genre: t.Optional[Genre] = None
            if cat_gram == CatGram.NOM:
                genre = cls.Parsers.genre.get(row["genre"], None)
            out.append(
                Mot(
                    mot=row["ortho"],
                    lemme=row["lemme"],
                    cat_gram=cat_gram,
                    freq=float(row["freqlemlivres"]),
                    genre=genre,
                )
            )
        out.sort(key=cls._find_word_key)  # We need to bisect on this.

        # Second pass: populate variants
        for row in rows:
            lemme = cls._find_word(out, row)
            if lemme is None:
                continue
            if lemme.cat_gram == CatGram.NOM:
                nombres = match_enum_or_all(row["nombre"], cls.Parsers.nombre, Nombre)
                for nombre in nombres:
                    lemme.variantes[(nombre,)] = row["ortho"]
            elif lemme.cat_gram == CatGram.VERBE:
                infover = row["infover"].split(";")
                for raw_ver in infover:
                    ver = raw_ver.split(":")
                    temps_select = ":".join(ver[0:2])
                    temps = cls.Parsers.verbe_temps.get(temps_select, None)
                    if temps is None:
                        continue  # we only keep a few indicative tenses
                    personne = cls.Parsers.verbe_personne.get(ver[2], None)
                    if personne is None:
                        continue  # we're not interested in all conj. persons
                    lemme.variantes[(temps, personne)] = row["ortho"]
            elif lemme.cat_gram == CatGram.ADJECTIF:
                genres = match_enum_or_all(row["genre"], cls.Parsers.genre, Genre)
                nombres = match_enum_or_all(row["nombre"], cls.Parsers.nombre, Nombre)
                for genre, nombre in itertools.product(genres, nombres):
                    lemme.variantes[(genre, nombre)] = row["ortho"]
            # No need to match adverbs (invariant)
        return cls(out)
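
    # Illustrative result of the two passes (word forms assumed, not checked
    # against the dataset): the Mot for the verb "chanter" would end up with
    # variantes such as
    #   {(Temps.PRESENT, Nombre.SING): "chante", (Temps.PRESENT, Nombre.PLUR): "chantent",
    #    (Temps.FUTUR, Nombre.SING): "chantera", ...}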

    def most_common(
        self, cat_gram: CatGram, threshold: t.Optional[int] = None
    ) -> list[Mot]:
        """The `threshold` most frequent words of the given grammatical category."""
        if threshold is None:
            try:
                threshold = self.PRESET_THRESHOLD_BY_CAT[cat_gram]
            except KeyError as exn:
                raise ValueError(
                    f"No threshold preset for grammatical category {cat_gram}, "
                    "please provide a threshold manually"
                ) from exn
        out = [word for word in self.dataset if word.cat_gram == cat_gram]
        out.sort(key=lambda word: word.freq, reverse=True)
        return out[:threshold]
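
    # Usage example (illustrative): with the presets above,
    #   lexique.most_common(CatGram.NOM)        # 10000 most frequent nouns
    #   lexique.most_common(CatGram.NOM, 500)   # explicit threshold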

    def word_db(self, thresholds: t.Optional[dict[CatGram, int]] = None) -> WordDb:
        """Convert to a WordDb"""
        thresholds = thresholds or {}

        noms = self.most_common(CatGram.NOM, thresholds.get(CatGram.NOM, None))
        db_noms = [
            Nom(
                genre=t.cast(Genre, nom.genre),  # not None for noms
                sing=nom.variantes[(Nombre.SING,)],
                plur=nom.variantes[(Nombre.PLUR,)],
            )
            for nom in noms
        ]

        adjectifs = self.most_common(
            CatGram.ADJECTIF, thresholds.get(CatGram.ADJECTIF, None)
        )
        db_adjectifs = [
            Adjectif(
                masc_sing=adj.variantes[(Genre.MASC, Nombre.SING)],
                masc_plur=adj.variantes[(Genre.MASC, Nombre.PLUR)],
                fem_sing=adj.variantes[(Genre.FEM, Nombre.SING)],
                fem_plur=adj.variantes[(Genre.FEM, Nombre.PLUR)],
            )
            for adj in adjectifs
        ]

        verbes = self.most_common(CatGram.VERBE, thresholds.get(CatGram.VERBE, None))
        db_verbes = [
            Verbe(
                present_sing=verbe.variantes[(Temps.PRESENT, Nombre.SING)],
                present_plur=verbe.variantes[(Temps.PRESENT, Nombre.PLUR)],
                futur_sing=verbe.variantes[(Temps.FUTUR, Nombre.SING)],
                futur_plur=verbe.variantes[(Temps.FUTUR, Nombre.PLUR)],
                imparfait_sing=verbe.variantes[(Temps.IMPARFAIT, Nombre.SING)],
                imparfait_plur=verbe.variantes[(Temps.IMPARFAIT, Nombre.PLUR)],
            )
            for verbe in verbes
        ]

        adverbes = self.most_common(
            CatGram.ADVERBE, thresholds.get(CatGram.ADVERBE, None)
        )
        db_adverbes = [Adverbe(adv=adv.mot) for adv in adverbes]

        return WordDb(
            noms=db_noms, adjectifs=db_adjectifs, verbes=db_verbes, adverbes=db_adverbes
        )
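

# Minimal usage sketch, assuming the package layout shown above; accessing the
# WordDb fields (noms, adjectifs, verbes, adverbes) by the names they are
# constructed with is an assumption about word_db.WordDb.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    lexique = Lexique.parse()
    db = lexique.word_db()
    print(f"{len(db.noms)} noms, {len(db.adjectifs)} adjectifs")
    print(f"{len(db.verbes)} verbes, {len(db.adverbes)} adverbes")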