patch2vimedit/patch2vimedit/hunk_changes.py

237 lines
7.2 KiB
Python

""" Tools to make smart edition to apply a hunk """
from collections import OrderedDict
import logging
logger = logging.getLogger(__name__)
class Levenshtein:
cache = {}
moves = {
"D": (-1, 0),
"S": (-1, -1),
"I": (0, -1),
}
auto_coalesce = False
def __init__(self, pre, post):
self.pre = pre
self.post = post
self.work_matrix = None
self.op_matrix = None
@classmethod
def get_full_cache(cls):
return cls.cache
def cache_key(self):
if isinstance(self.pre, list):
return (tuple(self.pre), tuple(self.post))
return (self.pre, self.post)
def getcache(self):
return self.get_full_cache().get(self.cache_key(), None)
@classmethod
def clean_cache(cls):
cls.cache = {}
def insert_cost(self, to_insert, pre_idx, post_idx):
return 1
def del_cost(self, to_delete, pre_idx, post_idx):
return 1
def subst_cost(self, subst_from, subst_to, pre_idx, post_idx):
return int(subst_from != subst_to)
@staticmethod
def _argmin(**kwargs):
""" Returns the first encountered pair (k, v) with lowest v wrt. `<`. """
if not kwargs:
raise Exception("No arguments")
argmin = next(iter(kwargs))
valmin = kwargs[argmin]
for key in kwargs:
val = kwargs[key]
if val < valmin:
argmin = key
valmin = val
return (argmin, valmin)
def compute(self):
cached = self.getcache()
if cached:
return cached
self.op_matrix = [[None] + ["I"] * len(self.post)] + [
["D"] + [None] * (len(self.post)) for _ in range(len(self.pre))
]
self.work_matrix = [
[0] * (len(self.post) + 1) for _ in range(len(self.pre) + 1)
]
for post_idx in range(len(self.post)):
self.work_matrix[0][post_idx + 1] = self.work_matrix[0][
post_idx
] + self.insert_cost(self.post[post_idx], 0, post_idx + 1)
for pre_idx in range(len(self.pre)):
self.work_matrix[pre_idx + 1][0] = self.work_matrix[pre_idx][
0
] + self.del_cost(self.pre[pre_idx], pre_idx + 1, 0)
for pre_idx in range(len(self.pre)):
for post_idx in range(len(self.post)):
c_insert_cost = self.insert_cost(
self.post[post_idx], pre_idx + 1, post_idx + 1
)
c_del_cost = self.del_cost(self.pre[pre_idx], pre_idx + 1, post_idx + 1)
c_subst_cost = self.subst_cost(
self.pre[pre_idx], self.post[post_idx], pre_idx + 1, post_idx + 1
)
opmin, costmin = self._argmin(
**OrderedDict(
[
("D", self.work_matrix[pre_idx][post_idx + 1] + c_del_cost),
(
"I",
self.work_matrix[pre_idx + 1][post_idx] + c_insert_cost,
),
("S", self.work_matrix[pre_idx][post_idx] + c_subst_cost),
]
)
)
self.work_matrix[pre_idx + 1][post_idx + 1] = costmin
self.op_matrix[pre_idx + 1][post_idx + 1] = opmin
self.cur_mode = opmin
ops = []
pre_pos = len(self.pre)
post_pos = len(self.post)
while pre_pos > 0 or post_pos > 0:
cur_op = self.op_matrix[pre_pos][post_pos]
cost_pre = pre_pos
cost_post = post_pos
row_m, col_m = self.moves[cur_op]
pre_pos += row_m
post_pos += col_m
if cur_op == "S":
pre_val = self.pre[pre_pos]
post_val = self.post[post_pos]
if pre_val == post_val:
cur_op = "L"
ops.append(
(
cur_op,
(pre_pos, post_pos),
(pre_val, post_val),
self.subst_cost(pre_val, post_val, cost_pre, cost_post),
)
)
if cur_op == "I":
post_val = self.post[post_pos]
ops.append(
(
cur_op,
post_pos,
post_val,
self.insert_cost(post_val, cost_pre, cost_post),
)
)
if cur_op == "D":
pre_val = self.pre[pre_pos]
ops.append(
(
cur_op,
pre_pos,
pre_val,
self.del_cost(pre_val, cost_pre, cost_post),
)
)
ops = ops[::-1]
if self.auto_coalesce:
ops = self.coalesce_ops(ops)
cached_result = {
"count": self.work_matrix[-1][-1],
"ops": ops,
}
self.get_full_cache()[self.cache_key()] = cached_result
return cached_result
@staticmethod
def coalesce_ops(in_ops):
out_ops = []
coal_op = None
coal_pos = None
coal_span = 0
coal_vals = None
coal_cost = 0
for (op, pos, val, cost) in in_ops:
if op == coal_op:
coal_span += 1
if op == "S":
pre, post = val
coal_pre, coal_post = coal_vals
coal_vals = (coal_pre + pre, coal_post + post)
else:
coal_vals += val
coal_cost += cost
else:
out_ops.append((coal_op, (coal_pos, coal_span), coal_vals, coal_cost))
coal_op = op
coal_pos = pos
coal_span = 1
coal_vals = val
coal_cost = cost
out_ops.append((coal_op, (coal_pos, coal_span), coal_vals, coal_cost))
return out_ops
class InlineLevenshtein(Levenshtein):
""" Levenshtein distance for edition of a single line """
auto_coalesce = True
def insert_cost(self, to_insert, pre_idx, post_idx):
return 1
def del_cost(self, to_delete, pre_idx, post_idx):
return 1 # 'x'
def subst_cost(self, subst_from, subst_to, pre_idx, post_idx):
if subst_from == subst_to:
return 0
return 1
class HunkLevenshtein(Levenshtein):
""" Levenshtein distance for hunk edition, ie. should we delete a whole line,
insert a whole line or substitute one line with another """
def insert_cost(self, to_insert, pre_idx, post_idx):
indent = 0
while indent < len(to_insert) and to_insert[indent] == " ":
indent += 1
indent_cost = 3 if indent else 0
return indent_cost + len(to_insert.strip()) + 1
def del_cost(self, to_delete, pre_idx, post_idx):
return 2
def subst_cost(self, subst_from, subst_to, pre_idx, post_idx):
res = InlineLevenshtein(subst_from, subst_to).compute()
return res["count"] * 1.5