From bf19ed3ea9970bdff65bdf63b3d9a3f1b7a9753b Mon Sep 17 00:00:00 2001 From: Auguste Rame <19855629+SuperAuguste@users.noreply.github.com> Date: Sat, 11 Feb 2023 14:21:10 -0500 Subject: [PATCH] Switch to Diff Match Patch (diffz) (#982) * Add tests, note about correctness issue * Use diffz (DiffMatchPatch) --- .gitmodules | 3 + build.zig | 8 + src/Server.zig | 4 +- src/diff.zig | 405 +++++++---------------------------------- src/diffz | 1 + src/offsets.zig | 38 ++++ src/zls.zig | 1 + tests/tests.zig | 1 + tests/utility/diff.zig | 30 +++ 9 files changed, 150 insertions(+), 341 deletions(-) create mode 160000 src/diffz create mode 100644 tests/utility/diff.zig diff --git a/.gitmodules b/.gitmodules index 7e91343..a2eb702 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,6 @@ [submodule "src/tres"] path = src/tres url = https://github.com/ziglibs/tres.git +[submodule "src/diffz"] + path = src/diffz + url = https://github.com/ziglibs/diffz diff --git a/build.zig b/build.zig index 62aee8c..2ae68c5 100644 --- a/build.zig +++ b/build.zig @@ -120,9 +120,15 @@ pub fn build(b: *std.build.Builder) !void { const tres_module = b.createModule(.{ .source_file = .{ .path = tres_path } }); exe.addModule("tres", tres_module); + const DIFFZ_DEFAULT_PATH = "src/diffz/DiffMatchPatch.zig"; + const diffz_path = b.option([]const u8, "diffz", "Path to diffz package (default: " ++ DIFFZ_DEFAULT_PATH ++ ")") orelse DIFFZ_DEFAULT_PATH; + const diffz_module = b.createModule(.{ .source_file = .{ .path = diffz_path } }); + exe.addModule("diffz", diffz_module); + const check_submodules_step = CheckSubmodulesStep.init(b, &.{ known_folders_path, tres_path, + diffz_path, }); b.getInstallStep().dependOn(&check_submodules_step.step); @@ -201,11 +207,13 @@ pub fn build(b: *std.build.Builder) !void { .dependencies = &.{ .{ .name = "known-folders", .module = known_folders_module }, .{ .name = "tres", .module = tres_module }, + .{ .name = "diffz", .module = diffz_module }, .{ .name = "build_options", .module = build_options_module }, }, }); tests.addModule("zls", zls_module); tests.addModule("tres", tres_module); + tests.addModule("diffz", diffz_module); test_step.dependOn(&tests.step); } diff --git a/src/Server.zig b/src/Server.zig index f2c1964..a1f75f5 100644 --- a/src/Server.zig +++ b/src/Server.zig @@ -2087,7 +2087,7 @@ fn changeDocumentHandler(server: *Server, notification: types.DidChangeTextDocum const handle = server.document_store.getHandle(notification.textDocument.uri) orelse return; - const new_text = try diff.applyTextEdits(server.allocator, handle.text, notification.contentChanges, server.offset_encoding); + const new_text = try diff.applyContentChanges(server.allocator, handle.text, notification.contentChanges, server.offset_encoding); try server.document_store.refreshDocument(handle.uri, new_text); @@ -2372,7 +2372,7 @@ fn formattingHandler(server: *Server, request: types.DocumentFormattingParams) E return text_edits; } - return if (diff.edits(allocator, handle.text, formatted)) |text_edits| text_edits.items else |_| null; + return if (diff.edits(allocator, handle.text, formatted, server.offset_encoding)) |text_edits| text_edits.items else |_| null; } fn didChangeConfigurationHandler(server: *Server, request: configuration.DidChangeConfigurationParams) Error!void { diff --git a/src/diff.zig b/src/diff.zig index 49bf4f7..be378da 100644 --- a/src/diff.zig +++ b/src/diff.zig @@ -1,358 +1,47 @@ const std = @import("std"); const types = @import("lsp.zig"); const offsets = @import("offsets.zig"); +const DiffMatchPatch = @import("diffz"); -pub const Error = error{ OutOfMemory, InvalidRange }; - -// Whether the `Change` is an addition, deletion, or no change from the -// original string to the new string -const Operation = enum { Deletion, Addition, Nothing }; - -/// A single character difference between two strings -const Change = struct { - operation: Operation, - pos: usize, - value: ?u8, +const dmp = DiffMatchPatch{ + .diff_timeout = 250, }; -/// Given two input strings, `a` and `b`, return a list of Edits that -/// describe the changes from `a` to `b` +pub const Error = error{ OutOfMemory, InvalidRange, UnknownError }; + pub fn edits( allocator: std.mem.Allocator, - a: []const u8, - b: []const u8, + before: []const u8, + after: []const u8, + encoding: offsets.Encoding, ) Error!std.ArrayListUnmanaged(types.TextEdit) { - // Given the input strings A and B, we skip over the first N characters - // where A[0..N] == B[0..N]. We want to trim the start (and end) of the - // strings that have the same text. This decreases the size of the LCS - // table and makes the diff comparison more efficient - var a_trim: []const u8 = a; - var b_trim: []const u8 = b; - const a_trim_offset = trim_input(&a_trim, &b_trim); + var diffs = try dmp.diff(allocator, before, after, true); + var eds = std.ArrayListUnmanaged(types.TextEdit){}; - const rows = a_trim.len + 1; - const cols = b_trim.len + 1; - - var lcs = try Array2D.new(allocator, rows, cols); - defer lcs.deinit(); - - calculate_lcs(&lcs, a_trim, b_trim); - - return try get_changes( - &lcs, - a, - a_trim_offset, - a_trim, - b_trim, - allocator, - ); -} - -fn trim_input(a_out: *[]const u8, b_out: *[]const u8) usize { - if (a_out.len == 0 or b_out.len == 0) return 0; - - var a: []const u8 = a_out.*; - var b: []const u8 = b_out.*; - - // Trim the beginning of the string - var start: usize = 0; - while (start < a.len and start < b.len and a[start] == b[start]) : ({ - start += 1; - }) {} - - // Trim the end of the string - var end: usize = 1; - while (end < a.len and end < b.len and a[a.len - end] == b[b.len - end]) : ({ - end += 1; - }) {} - end -= 1; - - var a_start = start; - var a_end = a.len - end; - var b_start = start; - var b_end = b.len - end; - - // In certain situations, the trimmed range can be "negative" where - // `a_start` ends up being after `a_end` in the byte stream. If you - // consider the following inputs: - // a: "xx gg xx" - // b: "xx gg xx" - // - // This will lead to the following calculations: - // a_start: 4 - // a_end: 4 - // b_start: 4 - // b_end: 2 - // - // In negative range situations, we add the absolute value of the - // the negative range's length (`b_start - b_end` in this case) to the - // other range's length (a_end + (b_start - b_end)), and then set the - // negative range end to the negative range start (b_end = b_start) - if (a_start > a_end) { - const difference = a_start - a_end; - a_end = a_start; - b_end += difference; - } - if (b_start > b_end) { - const difference = b_start - b_end; - b_end = b_start; - a_end += difference; - } - - a_out.* = a[a_start..a_end]; - b_out.* = b[b_start..b_end]; - - return start; -} - -/// A 2D array that is addressable as a[row, col] -pub const Array2D = struct { - const Self = @This(); - - data: [*]usize, - allocator: std.mem.Allocator, - rows: usize, - cols: usize, - - pub fn new( - allocator: std.mem.Allocator, - rows: usize, - cols: usize, - ) error{OutOfMemory}!Self { - const data = try allocator.alloc(usize, rows * cols); - - return Self{ - .data = data.ptr, - .allocator = allocator, - .rows = rows, - .cols = cols, - }; - } - - pub fn deinit(self: *Self) void { - self.allocator.free(self.data[0 .. self.rows * self.cols]); - } - - pub fn get(self: *Self, row: usize, col: usize) *usize { - return @ptrCast(*usize, self.data + (row * self.cols) + col); - } -}; - -/// Build a Longest Common Subsequence table -fn calculate_lcs( - lcs: *Array2D, - astr: []const u8, - bstr: []const u8, -) void { - const rows = astr.len + 1; - const cols = bstr.len + 1; - - std.mem.set(usize, lcs.data[0 .. rows * cols], 0); - - // This approach is a dynamic programming technique to calculate the - // longest common subsequence between two strings, `a` and `b`. We start - // at 1 for `i` and `j` because the first column and first row are always - // set to zero - // - // You can find more information about this at the following url: - // https://en.wikipedia.org/wiki/Longest_common_subsequence_problem - var i: usize = 1; - while (i < rows) : (i += 1) { - var j: usize = 1; - while (j < cols) : (j += 1) { - if (astr[i - 1] == bstr[j - 1]) { - lcs.get(i, j).* = lcs.get(i - 1, j - 1).* + 1; - } else { - lcs.get(i, j).* = std.math.max( - lcs.get(i - 1, j).*, - lcs.get(i, j - 1).*, - ); - } - } - } -} - -pub fn get_changes( - lcs: *Array2D, - a: []const u8, - a_trim_offset: usize, - a_trim: []const u8, - b_trim: []const u8, - allocator: std.mem.Allocator, -) Error!std.ArrayListUnmanaged(types.TextEdit) { - // First we get a list of changes between strings at the character level: - // "addition", "deletion", and "no change" for each character - var changes = try std.ArrayListUnmanaged(Change).initCapacity(allocator, a_trim.len); - defer changes.deinit(allocator); - try recur_changes( - lcs, - &changes, - a_trim, - b_trim, - @intCast(i64, a_trim.len), - @intCast(i64, b_trim.len), - allocator, - ); - - // We want to group runs of deletions and additions, and separate them by - // runs of `.Nothing` changes. This will allow us to calculate the - // `TextEdit` ranges - var groups = std.ArrayListUnmanaged([]Change){}; - defer groups.deinit(allocator); - - var active_change: ?[]Change = null; - for (changes.items) |ch, i| { - switch (ch.operation) { - .Addition, .Deletion => { - if (active_change == null) { - active_change = changes.items[i..]; - } + var offset: usize = 0; + for (diffs.items) |diff| { + var start = offset; + switch (diff.operation) { + .delete => { + offset += diff.text.len; + try eds.append(allocator, .{ .range = offsets.locToRange(before, .{ .start = start, .end = offset }, encoding), .newText = "" }); }, - .Nothing => { - if (active_change) |*ac| { - ac.* = ac.*[0..(i - (changes.items.len - ac.*.len))]; - try groups.append(allocator, ac.*); - active_change = null; - } + .equal => { + offset += diff.text.len; + }, + .insert => { + try eds.append(allocator, .{ .range = offsets.locToRange(before, .{ .start = start, .end = start }, encoding), .newText = diff.text }); }, } } - if (active_change) |*ac| { - ac.* = ac.*[0..(changes.items.len - (changes.items.len - ac.*.len))]; - try groups.append(allocator, ac.*); - } - - // The LCS algorithm works "in reverse", so we're putting everything back - // in ascending order - var a_lines = std.mem.split(u8, a, "\n"); - std.mem.reverse([]Change, groups.items); - for (groups.items) |group| std.mem.reverse(Change, group); - - var edit_results = try std.ArrayListUnmanaged(types.TextEdit).initCapacity(allocator, groups.items.len); - errdefer { - for (edit_results.items) |edit| { - allocator.free(edit.newText); - } - edit_results.deinit(allocator); - } - - // Convert our grouped changes into `Edit`s - for (groups.items) |group| { - var range_start = group[0].pos; - var range_len: usize = 0; - var newText = std.ArrayListUnmanaged(u8){}; - errdefer newText.deinit(allocator); - for (group) |ch| { - switch (ch.operation) { - .Addition => try newText.append(allocator, ch.value.?), - .Deletion => range_len += 1, - else => {}, - } - } - var range = try char_pos_to_range( - &a_lines, - a_trim_offset + range_start, - a_trim_offset + range_start + range_len, - ); - a_lines.reset(); - edit_results.appendAssumeCapacity(.{ - .range = range, - .newText = try newText.toOwnedSlice(allocator), - }); - } - - return edit_results; + return eds; } -fn recur_changes( - lcs: *Array2D, - changes: *std.ArrayListUnmanaged(Change), - a: []const u8, - b: []const u8, - i: i64, - j: i64, - allocator: std.mem.Allocator, -) error{OutOfMemory}!void { - // This function recursively works backwards through the LCS table in - // order to figure out what kind of changes took place to transform `a` - // into `b` - - const ii = @intCast(usize, i); - const jj = @intCast(usize, j); - - if (i > 0 and j > 0 and a[ii - 1] == b[jj - 1]) { - try changes.append(allocator, .{ - .operation = .Nothing, - .pos = ii - 1, - .value = null, - }); - try recur_changes(lcs, changes, a, b, i - 1, j - 1, allocator); - } else if (j > 0 and (i == 0 or lcs.get(ii, jj - 1).* >= lcs.get(ii - 1, jj).*)) { - try changes.append(allocator, .{ - .operation = .Addition, - .pos = ii, - .value = b[jj - 1], - }); - try recur_changes(lcs, changes, a, b, i, j - 1, allocator); - } else if (i > 0 and (j == 0 or lcs.get(ii, jj - 1).* < lcs.get(ii - 1, jj).*)) { - try changes.append(allocator, .{ - .operation = .Deletion, - .pos = ii - 1, - .value = a[ii - 1], - }); - try recur_changes(lcs, changes, a, b, i - 1, j, allocator); - } -} - -/// Accept a range that is solely based on buffer/character position and -/// convert it to line number & character position range -fn char_pos_to_range( - lines: *std.mem.SplitIterator(u8), - start: usize, - end: usize, -) Error!types.Range { - var char_pos: usize = 0; - var line_pos: usize = 0; - var result_start_pos: ?types.Position = null; - var result_end_pos: ?types.Position = null; - - while (lines.next()) |line| : ({ - char_pos += line.len + 1; - line_pos += 1; - }) { - if (start >= char_pos and start <= char_pos + line.len) { - result_start_pos = .{ - .line = @intCast(u32, line_pos), - .character = @intCast(u32, start - char_pos), - }; - } - if (end >= char_pos and end <= char_pos + line.len) { - result_end_pos = .{ - .line = @intCast(u32, line_pos), - .character = @intCast(u32, end - char_pos), - }; - } - } - - if (result_start_pos == null) return error.InvalidRange; - - // If we did not find an end position, it is outside the range of the - // string for some reason so clamp it to the string end position - if (result_end_pos == null) { - result_end_pos = types.Position{ - .line = @intCast(u32, line_pos), - .character = @intCast(u32, char_pos), - }; - } - - return types.Range{ - .start = result_start_pos.?, - .end = result_end_pos.?, - }; -} - -// Caller owns returned memory. -pub fn applyTextEdits( +/// Caller owns returned memory. +/// NOTE: As far as I know, this implementation is actually incorrect +/// as we use intermediate state, but at the same time, it works so +/// I really don't want to touch it right now. TODO: Investigate + fix. +pub fn applyContentChanges( allocator: std.mem.Allocator, text: []const u8, content_changes: []const types.TextDocumentContentChangeEvent, @@ -385,3 +74,41 @@ pub fn applyTextEdits( return try text_array.toOwnedSliceSentinel(allocator, 0); } + +// https://cs.opensource.google/go/x/tools/+/master:internal/lsp/diff/diff.go;l=40 + +fn textEditLessThan(_: void, lhs: types.TextEdit, rhs: types.TextEdit) bool { + return offsets.rangeLessThan(lhs.range, rhs.range); +} + +/// Caller owns returned memory. +pub fn applyTextEdits( + allocator: std.mem.Allocator, + text: []const u8, + text_edits: []const types.TextEdit, + encoding: offsets.Encoding, +) ![]const u8 { + var text_edits_sortable = try allocator.alloc(types.TextEdit, text_edits.len); + defer allocator.free(text_edits_sortable); + + std.mem.copy(types.TextEdit, text_edits_sortable, text_edits); + std.sort.sort(types.TextEdit, text_edits_sortable, {}, textEditLessThan); + + var final_text = std.ArrayListUnmanaged(u8){}; + + var last: usize = 0; + for (text_edits_sortable) |te| { + const start = offsets.maybePositionToIndex(text, te.range.start, encoding) orelse text.len; + if (start > last) { + try final_text.appendSlice(allocator, text[last..start]); + last = start; + } + try final_text.appendSlice(allocator, te.newText); + last = offsets.maybePositionToIndex(text, te.range.end, encoding) orelse text.len; + } + if (last < text.len) { + try final_text.appendSlice(allocator, text[last..]); + } + + return try final_text.toOwnedSlice(allocator); +} diff --git a/src/diffz b/src/diffz new file mode 160000 index 0000000..c36d881 --- /dev/null +++ b/src/diffz @@ -0,0 +1 @@ +Subproject commit c36d8817926ca9c35d4584b6cb3c497982ae6e37 diff --git a/src/offsets.zig b/src/offsets.zig index 38b8006..be68555 100644 --- a/src/offsets.zig +++ b/src/offsets.zig @@ -17,6 +17,25 @@ pub fn indexToPosition(text: []const u8, index: usize, encoding: Encoding) types }; } +pub fn maybePositionToIndex(text: []const u8, position: types.Position, encoding: Encoding) ?usize { + var line: u32 = 0; + var line_start_index: usize = 0; + for (text) |c, i| { + if (line == position.line) break; + if (c == '\n') { + line += 1; + line_start_index = i + 1; + } + } + + if (line != position.line) return null; + + const line_text = std.mem.sliceTo(text[line_start_index..], '\n'); + const line_byte_length = getNCodeUnitByteCount(line_text, position.character, encoding); + + return line_start_index + line_byte_length; +} + pub fn positionToIndex(text: []const u8, position: types.Position, encoding: Encoding) usize { var line: u32 = 0; var line_start_index: usize = 0; @@ -333,3 +352,22 @@ pub fn getNCodeUnitByteCount(text: []const u8, n: usize, encoding: Encoding) usi }, } } + +pub fn rangeLessThan(a: types.Range, b: types.Range) bool { + return positionLessThan(a.start, b.start) or positionLessThan(a.end, b.end); +} + +pub fn positionLessThan(a: types.Position, b: types.Position) bool { + if (a.line < b.line) { + return true; + } + if (a.line > b.line) { + return false; + } + + if (a.character < b.character) { + return true; + } + + return false; +} diff --git a/src/zls.zig b/src/zls.zig index cdd9b8a..2779184 100644 --- a/src/zls.zig +++ b/src/zls.zig @@ -13,3 +13,4 @@ pub const types = @import("lsp.zig"); pub const URI = @import("uri.zig"); pub const DocumentStore = @import("DocumentStore.zig"); pub const ComptimeInterpreter = @import("ComptimeInterpreter.zig"); +pub const diff = @import("diff.zig"); diff --git a/tests/tests.zig b/tests/tests.zig index b7719af..ebe7bce 100644 --- a/tests/tests.zig +++ b/tests/tests.zig @@ -5,6 +5,7 @@ comptime { _ = @import("utility/offsets.zig"); _ = @import("utility/position_context.zig"); _ = @import("utility/uri.zig"); + _ = @import("utility/diff.zig"); // TODO Lifecycle Messages diff --git a/tests/utility/diff.zig b/tests/utility/diff.zig new file mode 100644 index 0000000..0c4ae54 --- /dev/null +++ b/tests/utility/diff.zig @@ -0,0 +1,30 @@ +const std = @import("std"); +const zls = @import("zls"); + +const allocator = std.testing.allocator; + +fn gen(alloc: std.mem.Allocator, rand: std.rand.Random) ![]const u8 { + var buffer = try alloc.alloc(u8, rand.intRangeAtMost(usize, 16, 1024)); + for (buffer) |*b| b.* = rand.intRangeAtMost(u8, 32, 126); + return buffer; +} + +test "diff - random" { + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + var rand = std.rand.DefaultPrng.init(0); + + var index: usize = 0; + + while (index < 100) : (index += 1) { + defer _ = arena.reset(.retain_capacity); + + const pre = try gen(arena.allocator(), rand.random()); + const post = try gen(arena.allocator(), rand.random()); + + var edits = try zls.diff.edits(arena.allocator(), pre, post, .@"utf-8"); + const applied = try zls.diff.applyTextEdits(arena.allocator(), pre, edits.items, .@"utf-8"); + try std.testing.expectEqualStrings(post, applied); + } +}