patch2vimedit/patch2vimedit/hunk_changes.py

""" Tools to make smart edition to apply a hunk """

from collections import OrderedDict
import logging

logger = logging.getLogger(__name__)


class Levenshtein:
    cache = {}
    moves = {
        "D": (-1, 0),
        "S": (-1, -1),
        "I": (0, -1),
    }

    auto_coalesce = False

    def __init__(self, pre, post):
        self.pre = pre
        self.post = post
        self.work_matrix = None
        self.op_matrix = None

    @classmethod
    def get_full_cache(cls):
        return cls.cache

    def cache_key(self):
        if isinstance(self.pre, list):
            return (tuple(self.pre), tuple(self.post))
        return (self.pre, self.post)

    def getcache(self):
        return self.get_full_cache().get(self.cache_key(), None)

    @classmethod
    def clean_cache(cls):
        cls.cache = {}

    def insert_cost(self, to_insert, pre_idx, post_idx):
        return 1

    def del_cost(self, to_delete, pre_idx, post_idx):
        return 1

    def subst_cost(self, subst_from, subst_to, pre_idx, post_idx):
        return int(subst_from != subst_to)

    @staticmethod
    def _argmin(**kwargs):
        """ Returns the first encountered pair (k, v) with lowest v wrt. `<`. """
        if not kwargs:
            raise Exception("No arguments")
        argmin = next(iter(kwargs))
        valmin = kwargs[argmin]

        for key in kwargs:
            val = kwargs[key]
            if val < valmin:
                argmin = key
                valmin = val
        return (argmin, valmin)

    def compute(self):
        cached = self.getcache()
        if cached:
            return cached

        self.op_matrix = [[None] + ["I"] * len(self.post)] + [
            ["D"] + [None] * (len(self.post)) for _ in range(len(self.pre))
        ]
        self.work_matrix = [
            [0] * (len(self.post) + 1) for _ in range(len(self.pre) + 1)
        ]

        for post_idx in range(len(self.post)):
            self.work_matrix[0][post_idx + 1] = self.work_matrix[0][
                post_idx
            ] + self.insert_cost(self.post[post_idx], 0, post_idx + 1)
        for pre_idx in range(len(self.pre)):
            self.work_matrix[pre_idx + 1][0] = self.work_matrix[pre_idx][
                0
            ] + self.del_cost(self.pre[pre_idx], pre_idx + 1, 0)

        for pre_idx in range(len(self.pre)):
            for post_idx in range(len(self.post)):
                c_insert_cost = self.insert_cost(
                    self.post[post_idx], pre_idx + 1, post_idx + 1
                )
                c_del_cost = self.del_cost(self.pre[pre_idx], pre_idx + 1, post_idx + 1)
                c_subst_cost = self.subst_cost(
                    self.pre[pre_idx], self.post[post_idx], pre_idx + 1, post_idx + 1
                )

                opmin, costmin = self._argmin(
                    **OrderedDict(
                        [
                            ("D", self.work_matrix[pre_idx][post_idx + 1] + c_del_cost),
                            (
                                "I",
                                self.work_matrix[pre_idx + 1][post_idx] + c_insert_cost,
                            ),
                            ("S", self.work_matrix[pre_idx][post_idx] + c_subst_cost),
                        ]
                    )
                )

                self.work_matrix[pre_idx + 1][post_idx + 1] = costmin
                self.op_matrix[pre_idx + 1][post_idx + 1] = opmin
                self.cur_mode = opmin

        ops = []
        pre_pos = len(self.pre)
        post_pos = len(self.post)
        while pre_pos > 0 or post_pos > 0:
            cur_op = self.op_matrix[pre_pos][post_pos]
            cost_pre = pre_pos
            cost_post = post_pos
            row_m, col_m = self.moves[cur_op]
            pre_pos += row_m
            post_pos += col_m
            if cur_op == "S":
                pre_val = self.pre[pre_pos]
                post_val = self.post[post_pos]
                if pre_val == post_val:
                    cur_op = "L"
                ops.append(
                    (
                        cur_op,
                        (pre_pos, post_pos),
                        (pre_val, post_val),
                        self.subst_cost(pre_val, post_val, cost_pre, cost_post),
                    )
                )
            if cur_op == "I":
                post_val = self.post[post_pos]
                ops.append(
                    (
                        cur_op,
                        post_pos,
                        post_val,
                        self.insert_cost(post_val, cost_pre, cost_post),
                    )
                )
            if cur_op == "D":
                pre_val = self.pre[pre_pos]
                ops.append(
                    (
                        cur_op,
                        pre_pos,
                        pre_val,
                        self.del_cost(pre_val, cost_pre, cost_post),
                    )
                )

        ops = ops[::-1]
        if self.auto_coalesce:
            ops = self.coalesce_ops(ops)

        cached_result = {
            "count": self.work_matrix[-1][-1],
            "ops": ops,
        }

        self.get_full_cache()[self.cache_key()] = cached_result

        return cached_result

    @staticmethod
    def coalesce_ops(in_ops):
        out_ops = []

        coal_op = None
        coal_pos = None
        coal_span = 0
        coal_vals = None
        coal_cost = 0

        for (op, pos, val, cost) in in_ops:
            if op == coal_op:
                coal_span += 1
                if op == "S":
                    pre, post = val
                    coal_pre, coal_post = coal_vals
                    coal_vals = (coal_pre + pre, coal_post + post)
                else:
                    coal_vals += val
                coal_cost += cost

            else:
                out_ops.append((coal_op, (coal_pos, coal_span), coal_vals, coal_cost))
                coal_op = op
                coal_pos = pos
                coal_span = 1
                coal_vals = val
                coal_cost = cost

        out_ops.append((coal_op, (coal_pos, coal_span), coal_vals, coal_cost))
        return out_ops


class InlineLevenshtein(Levenshtein):
    """ Levenshtein distance for edition of a single line """

    auto_coalesce = True

    def insert_cost(self, to_insert, pre_idx, post_idx):
        return 1

    def del_cost(self, to_delete, pre_idx, post_idx):
        return 1  # 'x'

    def subst_cost(self, subst_from, subst_to, pre_idx, post_idx):
        if subst_from == subst_to:
            return 0
        return 1


class HunkLevenshtein(Levenshtein):
    """ Levenshtein distance for hunk edition, ie. should we delete a whole line,
    insert a whole line or substitute one line with another """

    def insert_cost(self, to_insert, pre_idx, post_idx):
        indent = 0
        while indent < len(to_insert) and to_insert[indent] == " ":
            indent += 1
        indent_cost = 3 if indent else 0
        return indent_cost + len(to_insert.strip()) + 1

    def del_cost(self, to_delete, pre_idx, post_idx):
        return 2

    def subst_cost(self, subst_from, subst_to, pre_idx, post_idx):
        res = InlineLevenshtein(subst_from, subst_to).compute()
        return res["count"] * 1.5