Switch to Diff Match Patch (diffz) (#982)

* Add tests, note about correctness issue

* Use diffz (DiffMatchPatch)
This commit is contained in:
Auguste Rame 2023-02-11 14:21:10 -05:00 committed by GitHub
parent 73d6264cab
commit bf19ed3ea9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 150 additions and 341 deletions

3
.gitmodules vendored
View File

@ -7,3 +7,6 @@
[submodule "src/tres"]
path = src/tres
url = https://github.com/ziglibs/tres.git
[submodule "src/diffz"]
path = src/diffz
url = https://github.com/ziglibs/diffz

View File

@ -120,9 +120,15 @@ pub fn build(b: *std.build.Builder) !void {
const tres_module = b.createModule(.{ .source_file = .{ .path = tres_path } });
exe.addModule("tres", tres_module);
const DIFFZ_DEFAULT_PATH = "src/diffz/DiffMatchPatch.zig";
const diffz_path = b.option([]const u8, "diffz", "Path to diffz package (default: " ++ DIFFZ_DEFAULT_PATH ++ ")") orelse DIFFZ_DEFAULT_PATH;
const diffz_module = b.createModule(.{ .source_file = .{ .path = diffz_path } });
exe.addModule("diffz", diffz_module);
const check_submodules_step = CheckSubmodulesStep.init(b, &.{
known_folders_path,
tres_path,
diffz_path,
});
b.getInstallStep().dependOn(&check_submodules_step.step);
@ -201,11 +207,13 @@ pub fn build(b: *std.build.Builder) !void {
.dependencies = &.{
.{ .name = "known-folders", .module = known_folders_module },
.{ .name = "tres", .module = tres_module },
.{ .name = "diffz", .module = diffz_module },
.{ .name = "build_options", .module = build_options_module },
},
});
tests.addModule("zls", zls_module);
tests.addModule("tres", tres_module);
tests.addModule("diffz", diffz_module);
test_step.dependOn(&tests.step);
}

View File

@ -2087,7 +2087,7 @@ fn changeDocumentHandler(server: *Server, notification: types.DidChangeTextDocum
const handle = server.document_store.getHandle(notification.textDocument.uri) orelse return;
const new_text = try diff.applyTextEdits(server.allocator, handle.text, notification.contentChanges, server.offset_encoding);
const new_text = try diff.applyContentChanges(server.allocator, handle.text, notification.contentChanges, server.offset_encoding);
try server.document_store.refreshDocument(handle.uri, new_text);
@ -2372,7 +2372,7 @@ fn formattingHandler(server: *Server, request: types.DocumentFormattingParams) E
return text_edits;
}
return if (diff.edits(allocator, handle.text, formatted)) |text_edits| text_edits.items else |_| null;
return if (diff.edits(allocator, handle.text, formatted, server.offset_encoding)) |text_edits| text_edits.items else |_| null;
}
fn didChangeConfigurationHandler(server: *Server, request: configuration.DidChangeConfigurationParams) Error!void {

View File

@ -1,358 +1,47 @@
const std = @import("std");
const types = @import("lsp.zig");
const offsets = @import("offsets.zig");
const DiffMatchPatch = @import("diffz");
pub const Error = error{ OutOfMemory, InvalidRange };
// Whether the `Change` is an addition, deletion, or no change from the
// original string to the new string
const Operation = enum { Deletion, Addition, Nothing };
/// A single character difference between two strings
const Change = struct {
operation: Operation,
pos: usize,
value: ?u8,
const dmp = DiffMatchPatch{
.diff_timeout = 250,
};
/// Given two input strings, `a` and `b`, return a list of Edits that
/// describe the changes from `a` to `b`
pub const Error = error{ OutOfMemory, InvalidRange, UnknownError };
pub fn edits(
allocator: std.mem.Allocator,
a: []const u8,
b: []const u8,
before: []const u8,
after: []const u8,
encoding: offsets.Encoding,
) Error!std.ArrayListUnmanaged(types.TextEdit) {
// Given the input strings A and B, we skip over the first N characters
// where A[0..N] == B[0..N]. We want to trim the start (and end) of the
// strings that have the same text. This decreases the size of the LCS
// table and makes the diff comparison more efficient
var a_trim: []const u8 = a;
var b_trim: []const u8 = b;
const a_trim_offset = trim_input(&a_trim, &b_trim);
var diffs = try dmp.diff(allocator, before, after, true);
var eds = std.ArrayListUnmanaged(types.TextEdit){};
const rows = a_trim.len + 1;
const cols = b_trim.len + 1;
var lcs = try Array2D.new(allocator, rows, cols);
defer lcs.deinit();
calculate_lcs(&lcs, a_trim, b_trim);
return try get_changes(
&lcs,
a,
a_trim_offset,
a_trim,
b_trim,
allocator,
);
}
/// Strip the common prefix and common suffix shared by `a_out.*` and `b_out.*`,
/// writing the narrowed slices back through both pointers.
///
/// Returns the number of leading bytes removed (the length of the common
/// prefix), which is the offset of the trimmed `a` inside the original `a`.
/// If either input is empty, nothing is modified and 0 is returned.
fn trim_input(a_out: *[]const u8, b_out: *[]const u8) usize {
if (a_out.len == 0 or b_out.len == 0) return 0;
var a: []const u8 = a_out.*;
var b: []const u8 = b_out.*;
// Trim the beginning of the string: `start` ends up as the length of the
// common prefix
var start: usize = 0;
while (start < a.len and start < b.len and a[start] == b[start]) : ({
start += 1;
}) {}
// Trim the end of the string: `end` counts matching bytes from the back.
// It starts at 1 because it is used as `len - end`, and the loop stops one
// past the last match, hence the decrement afterwards.
var end: usize = 1;
while (end < a.len and end < b.len and a[a.len - end] == b[b.len - end]) : ({
end += 1;
}) {}
end -= 1;
// The prefix and suffix scans are independent, so the two ranges below can
// overlap when the same bytes were counted by both scans.
var a_start = start;
var a_end = a.len - end;
var b_start = start;
var b_end = b.len - end;
// In certain situations, the trimmed range can be "negative" where
// the range start ends up being after the range end in the byte stream.
// If you consider the following inputs (whitespace is significant: `a` has
// four spaces after the first "xx", `b` has two):
// a: "xx    gg xx"
// b: "xx  gg xx"
//
// This will lead to the following calculations:
// a_start: 4
// a_end: 4
// b_start: 4
// b_end: 2
//
// In negative range situations, we add the absolute value of the
// negative range's length (`b_start - b_end` in this case) to the
// other range's length (a_end + (b_start - b_end)), and then set the
// negative range end to the negative range start (b_end = b_start)
if (a_start > a_end) {
const difference = a_start - a_end;
a_end = a_start;
b_end += difference;
}
if (b_start > b_end) {
const difference = b_start - b_end;
b_end = b_start;
a_end += difference;
}
a_out.* = a[a_start..a_end];
b_out.* = b[b_start..b_end];
return start;
}
/// Row-major 2D table of `usize` cells, addressed as `table.get(row, col)`.
pub const Array2D = struct {
    const Self = @This();

    data: [*]usize,
    allocator: std.mem.Allocator,
    rows: usize,
    cols: usize,

    /// Allocate storage for `rows * cols` cells using `allocator`.
    /// The cells are not initialized. Release with `deinit`.
    pub fn new(
        allocator: std.mem.Allocator,
        rows: usize,
        cols: usize,
    ) error{OutOfMemory}!Self {
        const cells = try allocator.alloc(usize, rows * cols);
        const table = Self{
            .data = cells.ptr,
            .allocator = allocator,
            .rows = rows,
            .cols = cols,
        };
        return table;
    }

    /// Free the backing storage obtained in `new`.
    pub fn deinit(self: *Self) void {
        const cell_count = self.rows * self.cols;
        self.allocator.free(self.data[0..cell_count]);
    }

    /// Pointer to the cell at (`row`, `col`). No bounds checking is performed.
    pub fn get(self: *Self, row: usize, col: usize) *usize {
        const flat_index = (row * self.cols) + col;
        return &self.data[flat_index];
    }
};
/// Fill `lcs` with Longest Common Subsequence lengths for `astr` and `bstr`.
///
/// Cell (r, c) receives the LCS length of `astr[0..r]` and `bstr[0..c]`.
/// `lcs` must hold at least `(astr.len + 1) * (bstr.len + 1)` cells.
///
/// Background: https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
fn calculate_lcs(
    lcs: *Array2D,
    astr: []const u8,
    bstr: []const u8,
) void {
    const row_count = astr.len + 1;
    const col_count = bstr.len + 1;

    // Clear the whole table; row 0 and column 0 are never written again and
    // act as the zero base case of the dynamic program.
    std.mem.set(usize, lcs.data[0 .. row_count * col_count], 0);

    var row: usize = 1;
    while (row < row_count) : (row += 1) {
        var col: usize = 1;
        while (col < col_count) : (col += 1) {
            lcs.get(row, col).* = if (astr[row - 1] == bstr[col - 1])
                // Matching bytes extend the subsequence found diagonally.
                lcs.get(row - 1, col - 1).* + 1
            else
                // Otherwise carry over the better of "skip a byte of a" and
                // "skip a byte of b".
                std.math.max(
                    lcs.get(row - 1, col).*,
                    lcs.get(row, col - 1).*,
                );
        }
    }
}
pub fn get_changes(
lcs: *Array2D,
a: []const u8,
a_trim_offset: usize,
a_trim: []const u8,
b_trim: []const u8,
allocator: std.mem.Allocator,
) Error!std.ArrayListUnmanaged(types.TextEdit) {
// First we get a list of changes between strings at the character level:
// "addition", "deletion", and "no change" for each character
var changes = try std.ArrayListUnmanaged(Change).initCapacity(allocator, a_trim.len);
defer changes.deinit(allocator);
try recur_changes(
lcs,
&changes,
a_trim,
b_trim,
@intCast(i64, a_trim.len),
@intCast(i64, b_trim.len),
allocator,
);
// We want to group runs of deletions and additions, and separate them by
// runs of `.Nothing` changes. This will allow us to calculate the
// `TextEdit` ranges
var groups = std.ArrayListUnmanaged([]Change){};
defer groups.deinit(allocator);
var active_change: ?[]Change = null;
for (changes.items) |ch, i| {
switch (ch.operation) {
.Addition, .Deletion => {
if (active_change == null) {
active_change = changes.items[i..];
}
var offset: usize = 0;
for (diffs.items) |diff| {
var start = offset;
switch (diff.operation) {
.delete => {
offset += diff.text.len;
try eds.append(allocator, .{ .range = offsets.locToRange(before, .{ .start = start, .end = offset }, encoding), .newText = "" });
},
.Nothing => {
if (active_change) |*ac| {
ac.* = ac.*[0..(i - (changes.items.len - ac.*.len))];
try groups.append(allocator, ac.*);
active_change = null;
}
.equal => {
offset += diff.text.len;
},
.insert => {
try eds.append(allocator, .{ .range = offsets.locToRange(before, .{ .start = start, .end = start }, encoding), .newText = diff.text });
},
}
}
if (active_change) |*ac| {
ac.* = ac.*[0..(changes.items.len - (changes.items.len - ac.*.len))];
try groups.append(allocator, ac.*);
return eds;
}
// The LCS algorithm works "in reverse", so we're putting everything back
// in ascending order
var a_lines = std.mem.split(u8, a, "\n");
std.mem.reverse([]Change, groups.items);
for (groups.items) |group| std.mem.reverse(Change, group);
var edit_results = try std.ArrayListUnmanaged(types.TextEdit).initCapacity(allocator, groups.items.len);
errdefer {
for (edit_results.items) |edit| {
allocator.free(edit.newText);
}
edit_results.deinit(allocator);
}
// Convert our grouped changes into `Edit`s
for (groups.items) |group| {
var range_start = group[0].pos;
var range_len: usize = 0;
var newText = std.ArrayListUnmanaged(u8){};
errdefer newText.deinit(allocator);
for (group) |ch| {
switch (ch.operation) {
.Addition => try newText.append(allocator, ch.value.?),
.Deletion => range_len += 1,
else => {},
}
}
var range = try char_pos_to_range(
&a_lines,
a_trim_offset + range_start,
a_trim_offset + range_start + range_len,
);
a_lines.reset();
edit_results.appendAssumeCapacity(.{
.range = range,
.newText = try newText.toOwnedSlice(allocator),
});
}
return edit_results;
}
/// Walk the LCS table backwards from cell (`i`, `j`) and append to `changes`
/// the per-character operations that transform `a` into `b`.
///
/// Changes are appended in reverse order (last character first). `i` and `j`
/// must be non-negative and within the bounds of `lcs`.
///
/// This is a loop rather than a recursion: the walk takes one step per
/// character of `a` and `b`, so a recursive form would need a stack frame per
/// character and could overflow the stack on large inputs.
fn recur_changes(
    lcs: *Array2D,
    changes: *std.ArrayListUnmanaged(Change),
    a: []const u8,
    b: []const u8,
    i: i64,
    j: i64,
    allocator: std.mem.Allocator,
) error{OutOfMemory}!void {
    var row = @intCast(usize, i);
    var col = @intCast(usize, j);

    while (true) {
        if (row > 0 and col > 0 and a[row - 1] == b[col - 1]) {
            // Same byte in both strings: step diagonally.
            try changes.append(allocator, .{
                .operation = .Nothing,
                .pos = row - 1,
                .value = null,
            });
            row -= 1;
            col -= 1;
        } else if (col > 0 and (row == 0 or lcs.get(row, col - 1).* >= lcs.get(row - 1, col).*)) {
            // `row == 0` short-circuits so `row - 1` is never evaluated at the edge.
            try changes.append(allocator, .{
                .operation = .Addition,
                .pos = row,
                .value = b[col - 1],
            });
            col -= 1;
        } else if (row > 0 and (col == 0 or lcs.get(row, col - 1).* < lcs.get(row - 1, col).*)) {
            // `col == 0` short-circuits so `col - 1` is never evaluated at the edge.
            try changes.append(allocator, .{
                .operation = .Deletion,
                .pos = row - 1,
                .value = a[row - 1],
            });
            row -= 1;
        } else break; // reached cell (0, 0)
    }
}
/// Convert a range given as byte offsets `start`..`end` into a line/character
/// `types.Range`, using `lines` to walk the text line by line.
///
/// `lines` is consumed to the end; the caller must `reset()` it before reuse.
/// Returns `error.InvalidRange` if `start` does not fall on any line. If `end`
/// does not fall on any line, it is clamped to the end of the last line.
fn char_pos_to_range(
    lines: *std.mem.SplitIterator(u8),
    start: usize,
    end: usize,
) Error!types.Range {
    var char_pos: usize = 0; // byte offset of the current line's first byte
    var line_pos: usize = 0; // index of the current line
    var last_line_len: usize = 0;
    var result_start_pos: ?types.Position = null;
    var result_end_pos: ?types.Position = null;

    while (lines.next()) |line| : ({
        // + 1 accounts for the "\n" delimiter removed by the split
        char_pos += line.len + 1;
        line_pos += 1;
    }) {
        last_line_len = line.len;

        if (start >= char_pos and start <= char_pos + line.len) {
            result_start_pos = .{
                .line = @intCast(u32, line_pos),
                .character = @intCast(u32, start - char_pos),
            };
        }
        if (end >= char_pos and end <= char_pos + line.len) {
            result_end_pos = .{
                .line = @intCast(u32, line_pos),
                .character = @intCast(u32, end - char_pos),
            };
        }
    }

    const start_pos = result_start_pos orelse return error.InvalidRange;

    // If we did not find an end position, it is outside the range of the
    // string, so clamp it to the end of the last line. After the loop
    // `line_pos` is the line count and `char_pos` is an absolute byte offset,
    // so neither is a valid line index / column by itself. `line_pos >= 1`
    // here because a start position was found.
    const end_pos = result_end_pos orelse types.Position{
        .line = @intCast(u32, line_pos - 1),
        .character = @intCast(u32, last_line_len),
    };

    return types.Range{
        .start = start_pos,
        .end = end_pos,
    };
}
// Caller owns returned memory.
pub fn applyTextEdits(
/// Caller owns returned memory.
/// NOTE: As far as I know, this implementation is actually incorrect
/// as we use intermediate state, but at the same time, it works so
/// I really don't want to touch it right now. TODO: Investigate + fix.
pub fn applyContentChanges(
allocator: std.mem.Allocator,
text: []const u8,
content_changes: []const types.TextDocumentContentChangeEvent,
@ -385,3 +74,41 @@ pub fn applyTextEdits(
return try text_array.toOwnedSliceSentinel(allocator, 0);
}
// https://cs.opensource.google/go/x/tools/+/master:internal/lsp/diff/diff.go;l=40
/// Sort comparator for text edits: compares the two edits' ranges with
/// `offsets.rangeLessThan`. The context argument is unused.
fn textEditLessThan(_: void, lhs: types.TextEdit, rhs: types.TextEdit) bool {
    const left_range = lhs.range;
    const right_range = rhs.range;
    return offsets.rangeLessThan(left_range, right_range);
}
/// Apply `text_edits` to `text` and return the resulting document.
///
/// The edits are copied and sorted by range before being applied, so the
/// input slice is not modified and may be in any order. A range position that
/// cannot be resolved inside `text` is treated as the end of `text`.
///
/// Caller owns returned memory.
pub fn applyTextEdits(
    allocator: std.mem.Allocator,
    text: []const u8,
    text_edits: []const types.TextEdit,
    encoding: offsets.Encoding,
) ![]const u8 {
    const text_edits_sortable = try allocator.dupe(types.TextEdit, text_edits);
    defer allocator.free(text_edits_sortable);

    std.sort.sort(types.TextEdit, text_edits_sortable, {}, textEditLessThan);

    var final_text = std.ArrayListUnmanaged(u8){};
    // Release the partially built result if any append below fails.
    errdefer final_text.deinit(allocator);

    // Byte index in `text` up to which the input has been consumed.
    var last: usize = 0;
    for (text_edits_sortable) |te| {
        const start = offsets.maybePositionToIndex(text, te.range.start, encoding) orelse text.len;
        if (start > last) {
            // Copy the untouched text between the previous edit and this one.
            try final_text.appendSlice(allocator, text[last..start]);
            last = start;
        }
        try final_text.appendSlice(allocator, te.newText);
        // Skip over the replaced range.
        last = offsets.maybePositionToIndex(text, te.range.end, encoding) orelse text.len;
    }
    if (last < text.len) {
        try final_text.appendSlice(allocator, text[last..]);
    }

    return try final_text.toOwnedSlice(allocator);
}

1
src/diffz Submodule

@ -0,0 +1 @@
Subproject commit c36d8817926ca9c35d4584b6cb3c497982ae6e37

View File

@ -17,6 +17,25 @@ pub fn indexToPosition(text: []const u8, index: usize, encoding: Encoding) types
};
}
/// Translate `position` (line + character in `encoding` code units) into a
/// byte index in `text`. Returns `null` when `text` has fewer lines than
/// `position.line` requires.
pub fn maybePositionToIndex(text: []const u8, position: types.Position, encoding: Encoding) ?usize {
    var current_line: u32 = 0;
    var start_of_line: usize = 0;

    // Scan forward until the requested line is reached or the text runs out.
    var cursor: usize = 0;
    while (cursor < text.len and current_line != position.line) : (cursor += 1) {
        if (text[cursor] != '\n') continue;
        current_line += 1;
        start_of_line = cursor + 1;
    }

    if (current_line != position.line) return null;

    // Convert the character offset to a byte count within this line only.
    const line_text = std.mem.sliceTo(text[start_of_line..], '\n');
    const byte_offset_in_line = getNCodeUnitByteCount(line_text, position.character, encoding);

    return start_of_line + byte_offset_in_line;
}
pub fn positionToIndex(text: []const u8, position: types.Position, encoding: Encoding) usize {
var line: u32 = 0;
var line_start_index: usize = 0;
@ -333,3 +352,22 @@ pub fn getNCodeUnitByteCount(text: []const u8, n: usize, encoding: Encoding) usi
},
}
}
/// Order ranges lexicographically: by start position first, then by end
/// position when the starts are equal.
///
/// This is a strict weak ordering, so it is safe to use as a sort comparator:
/// `rangeLessThan(a, b)` and `rangeLessThan(b, a)` are never both true.
pub fn rangeLessThan(a: types.Range, b: types.Range) bool {
    if (positionLessThan(a.start, b.start)) return true;
    if (positionLessThan(b.start, a.start)) return false;
    // Starts are equal; the end position breaks the tie.
    return positionLessThan(a.end, b.end);
}
/// Whether position `a` comes strictly before position `b`: an earlier line
/// wins, and on the same line the smaller character offset wins.
pub fn positionLessThan(a: types.Position, b: types.Position) bool {
    if (a.line != b.line) return a.line < b.line;
    return a.character < b.character;
}

View File

@ -13,3 +13,4 @@ pub const types = @import("lsp.zig");
pub const URI = @import("uri.zig");
pub const DocumentStore = @import("DocumentStore.zig");
pub const ComptimeInterpreter = @import("ComptimeInterpreter.zig");
pub const diff = @import("diff.zig");

View File

@ -5,6 +5,7 @@ comptime {
_ = @import("utility/offsets.zig");
_ = @import("utility/position_context.zig");
_ = @import("utility/uri.zig");
_ = @import("utility/diff.zig");
// TODO Lifecycle Messages

30
tests/utility/diff.zig Normal file
View File

@ -0,0 +1,30 @@
const std = @import("std");
const zls = @import("zls");
const allocator = std.testing.allocator;
/// Produce a random string of 16 to 1024 bytes, each in the range 32..126
/// (printable ASCII, so never a newline). Caller owns the returned memory.
fn gen(alloc: std.mem.Allocator, rand: std.rand.Random) ![]const u8 {
    const length = rand.intRangeAtMost(usize, 16, 1024);
    const bytes = try alloc.alloc(u8, length);

    var cursor: usize = 0;
    while (cursor < bytes.len) : (cursor += 1) {
        bytes[cursor] = rand.intRangeAtMost(u8, 32, 126);
    }

    return bytes;
}
// Round-trip check: for 100 pairs of random strings, the edits computed from
// `pre` to `post` must reproduce `post` when applied to `pre`.
test "diff - random" {
    var arena = std.heap.ArenaAllocator.init(allocator);
    defer arena.deinit();
    const scratch = arena.allocator();

    // Fixed seed keeps the generated inputs reproducible between runs.
    var prng = std.rand.DefaultPrng.init(0);
    const random = prng.random();

    var round: usize = 0;
    while (round < 100) : (round += 1) {
        // Drop everything allocated in this round but keep the arena's capacity.
        defer _ = arena.reset(.retain_capacity);

        const pre = try gen(scratch, random);
        const post = try gen(scratch, random);

        const edits = try zls.diff.edits(scratch, pre, post, .@"utf-8");
        const applied = try zls.diff.applyTextEdits(scratch, pre, edits.items, .@"utf-8");

        try std.testing.expectEqualStrings(post, applied);
    }
}