From bf19ed3ea9970bdff65bdf63b3d9a3f1b7a9753b Mon Sep 17 00:00:00 2001
From: Auguste Rame <19855629+SuperAuguste@users.noreply.github.com>
Date: Sat, 11 Feb 2023 14:21:10 -0500
Subject: [PATCH] Switch to Diff Match Patch (diffz) (#982)

* Add tests, note about correctness issue

* Use diffz (DiffMatchPatch)
---
 .gitmodules            |   3 +
 build.zig              |   8 +
 src/Server.zig         |   4 +-
 src/diff.zig           | 405 +++++++----------------------------------
 src/diffz              |   1 +
 src/offsets.zig        |  38 ++++
 src/zls.zig            |   1 +
 tests/tests.zig        |   1 +
 tests/utility/diff.zig |  30 +++
 9 files changed, 150 insertions(+), 341 deletions(-)
 create mode 160000 src/diffz
 create mode 100644 tests/utility/diff.zig

diff --git a/.gitmodules b/.gitmodules
index 7e91343..a2eb702 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,3 +7,6 @@
 [submodule "src/tres"]
 	path = src/tres
 	url = https://github.com/ziglibs/tres.git
+[submodule "src/diffz"]
+	path = src/diffz
+	url = https://github.com/ziglibs/diffz
diff --git a/build.zig b/build.zig
index 62aee8c..2ae68c5 100644
--- a/build.zig
+++ b/build.zig
@@ -120,9 +120,15 @@ pub fn build(b: *std.build.Builder) !void {
     const tres_module = b.createModule(.{ .source_file = .{ .path = tres_path } });
     exe.addModule("tres", tres_module);
 
+    const DIFFZ_DEFAULT_PATH = "src/diffz/DiffMatchPatch.zig";
+    const diffz_path = b.option([]const u8, "diffz", "Path to diffz package (default: " ++ DIFFZ_DEFAULT_PATH ++ ")") orelse DIFFZ_DEFAULT_PATH;
+    const diffz_module = b.createModule(.{ .source_file = .{ .path = diffz_path } });
+    exe.addModule("diffz", diffz_module);
+
     const check_submodules_step = CheckSubmodulesStep.init(b, &.{
         known_folders_path,
         tres_path,
+        diffz_path,
     });
     b.getInstallStep().dependOn(&check_submodules_step.step);
 
@@ -201,11 +207,13 @@ pub fn build(b: *std.build.Builder) !void {
         .dependencies = &.{
             .{ .name = "known-folders", .module = known_folders_module },
             .{ .name = "tres", .module = tres_module },
+            .{ .name = "diffz", .module = diffz_module },
             .{ .name = "build_options", .module = build_options_module },
         },
     });
     tests.addModule("zls", zls_module);
     tests.addModule("tres", tres_module);
+    tests.addModule("diffz", diffz_module);
 
     test_step.dependOn(&tests.step);
 }
diff --git a/src/Server.zig b/src/Server.zig
index f2c1964..a1f75f5 100644
--- a/src/Server.zig
+++ b/src/Server.zig
@@ -2087,7 +2087,7 @@ fn changeDocumentHandler(server: *Server, notification: types.DidChangeTextDocum
 
     const handle = server.document_store.getHandle(notification.textDocument.uri) orelse return;
 
-    const new_text = try diff.applyTextEdits(server.allocator, handle.text, notification.contentChanges, server.offset_encoding);
+    const new_text = try diff.applyContentChanges(server.allocator, handle.text, notification.contentChanges, server.offset_encoding);
 
     try server.document_store.refreshDocument(handle.uri, new_text);
 
@@ -2372,7 +2372,7 @@ fn formattingHandler(server: *Server, request: types.DocumentFormattingParams) E
         return text_edits;
     }
 
-    return if (diff.edits(allocator, handle.text, formatted)) |text_edits| text_edits.items else |_| null;
+    return if (diff.edits(allocator, handle.text, formatted, server.offset_encoding)) |text_edits| text_edits.items else |_| null;
 }
 
 fn didChangeConfigurationHandler(server: *Server, request: configuration.DidChangeConfigurationParams) Error!void {
diff --git a/src/diff.zig b/src/diff.zig
index 49bf4f7..be378da 100644
--- a/src/diff.zig
+++ b/src/diff.zig
@@ -1,358 +1,47 @@
 const std = @import("std");
 const types = @import("lsp.zig");
 const offsets = @import("offsets.zig");
+const DiffMatchPatch = @import("diffz");
 
-pub const Error = error{ OutOfMemory, InvalidRange };
-
-// Whether the `Change` is an addition, deletion, or no change from the
-// original string to the new string
-const Operation = enum { Deletion, Addition, Nothing };
-
-/// A single character difference between two strings
-const Change = struct {
-    operation: Operation,
-    pos: usize,
-    value: ?u8,
+const dmp = DiffMatchPatch{ // shared diff configuration used by `edits`
+    .diff_timeout = 250, // NOTE(review): presumably milliseconds — confirm against diffz
 };
 
-/// Given two input strings, `a` and `b`, return a list of Edits that
-/// describe the changes from `a` to `b`
+pub const Error = error{ OutOfMemory, InvalidRange, UnknownError };
+
 pub fn edits(
     allocator: std.mem.Allocator,
-    a: []const u8,
-    b: []const u8,
+    before: []const u8,
+    after: []const u8,
+    encoding: offsets.Encoding,
 ) Error!std.ArrayListUnmanaged(types.TextEdit) {
-    // Given the input strings A and B, we skip over the first N characters
-    // where A[0..N] == B[0..N]. We want to trim the start (and end) of the
-    // strings that have the same text. This decreases the size of the LCS
-    // table and makes the diff comparison more efficient
-    var a_trim: []const u8 = a;
-    var b_trim: []const u8 = b;
-    const a_trim_offset = trim_input(&a_trim, &b_trim);
+    var diffs = try dmp.diff(allocator, before, after, true); // NOTE(review): `diffs` is never deinitialized in this function — presumably `allocator` is an arena; confirm
+    var eds = std.ArrayListUnmanaged(types.TextEdit){}; // NOTE(review): no `errdefer`, so a failing `append` below returns without freeing `eds`
 
-    const rows = a_trim.len + 1;
-    const cols = b_trim.len + 1;
-
-    var lcs = try Array2D.new(allocator, rows, cols);
-    defer lcs.deinit();
-
-    calculate_lcs(&lcs, a_trim, b_trim);
-
-    return try get_changes(
-        &lcs,
-        a,
-        a_trim_offset,
-        a_trim,
-        b_trim,
-        allocator,
-    );
-}
-
-fn trim_input(a_out: *[]const u8, b_out: *[]const u8) usize {
-    if (a_out.len == 0 or b_out.len == 0) return 0;
-
-    var a: []const u8 = a_out.*;
-    var b: []const u8 = b_out.*;
-
-    // Trim the beginning of the string
-    var start: usize = 0;
-    while (start < a.len and start < b.len and a[start] == b[start]) : ({
-        start += 1;
-    }) {}
-
-    // Trim the end of the string
-    var end: usize = 1;
-    while (end < a.len and end < b.len and a[a.len - end] == b[b.len - end]) : ({
-        end += 1;
-    }) {}
-    end -= 1;
-
-    var a_start = start;
-    var a_end = a.len - end;
-    var b_start = start;
-    var b_end = b.len - end;
-
-    // In certain situations, the trimmed range can be "negative" where
-    // `a_start` ends up being after `a_end` in the byte stream. If you
-    // consider the following inputs:
-    //     a: "xx    gg  xx"
-    //     b: "xx  gg  xx"
-    //
-    // This will lead to the following calculations:
-    //     a_start: 4
-    //     a_end: 4
-    //     b_start: 4
-    //     b_end: 2
-    //
-    // In negative range situations, we add the absolute value of the
-    // the negative range's length (`b_start - b_end` in this case) to the
-    // other range's length (a_end + (b_start - b_end)), and then set the
-    // negative range end to the negative range start (b_end = b_start)
-    if (a_start > a_end) {
-        const difference = a_start - a_end;
-        a_end = a_start;
-        b_end += difference;
-    }
-    if (b_start > b_end) {
-        const difference = b_start - b_end;
-        b_end = b_start;
-        a_end += difference;
-    }
-
-    a_out.* = a[a_start..a_end];
-    b_out.* = b[b_start..b_end];
-
-    return start;
-}
-
-/// A 2D array that is addressable as a[row, col]
-pub const Array2D = struct {
-    const Self = @This();
-
-    data: [*]usize,
-    allocator: std.mem.Allocator,
-    rows: usize,
-    cols: usize,
-
-    pub fn new(
-        allocator: std.mem.Allocator,
-        rows: usize,
-        cols: usize,
-    ) error{OutOfMemory}!Self {
-        const data = try allocator.alloc(usize, rows * cols);
-
-        return Self{
-            .data = data.ptr,
-            .allocator = allocator,
-            .rows = rows,
-            .cols = cols,
-        };
-    }
-
-    pub fn deinit(self: *Self) void {
-        self.allocator.free(self.data[0 .. self.rows * self.cols]);
-    }
-
-    pub fn get(self: *Self, row: usize, col: usize) *usize {
-        return @ptrCast(*usize, self.data + (row * self.cols) + col);
-    }
-};
-
-/// Build a Longest Common Subsequence table
-fn calculate_lcs(
-    lcs: *Array2D,
-    astr: []const u8,
-    bstr: []const u8,
-) void {
-    const rows = astr.len + 1;
-    const cols = bstr.len + 1;
-
-    std.mem.set(usize, lcs.data[0 .. rows * cols], 0);
-
-    // This approach is a dynamic programming technique to calculate the
-    // longest common subsequence between two strings, `a` and `b`. We start
-    // at 1 for `i` and `j` because the first column and first row are always
-    // set to zero
-    //
-    // You can find more information about this at the following url:
-    // https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
-    var i: usize = 1;
-    while (i < rows) : (i += 1) {
-        var j: usize = 1;
-        while (j < cols) : (j += 1) {
-            if (astr[i - 1] == bstr[j - 1]) {
-                lcs.get(i, j).* = lcs.get(i - 1, j - 1).* + 1;
-            } else {
-                lcs.get(i, j).* = std.math.max(
-                    lcs.get(i - 1, j).*,
-                    lcs.get(i, j - 1).*,
-                );
-            }
-        }
-    }
-}
-
-pub fn get_changes(
-    lcs: *Array2D,
-    a: []const u8,
-    a_trim_offset: usize,
-    a_trim: []const u8,
-    b_trim: []const u8,
-    allocator: std.mem.Allocator,
-) Error!std.ArrayListUnmanaged(types.TextEdit) {
-    // First we get a list of changes between strings at the character level:
-    // "addition", "deletion", and "no change" for each character
-    var changes = try std.ArrayListUnmanaged(Change).initCapacity(allocator, a_trim.len);
-    defer changes.deinit(allocator);
-    try recur_changes(
-        lcs,
-        &changes,
-        a_trim,
-        b_trim,
-        @intCast(i64, a_trim.len),
-        @intCast(i64, b_trim.len),
-        allocator,
-    );
-
-    // We want to group runs of deletions and additions, and separate them by
-    // runs of `.Nothing` changes. This will allow us to calculate the
-    // `TextEdit` ranges
-    var groups = std.ArrayListUnmanaged([]Change){};
-    defer groups.deinit(allocator);
-
-    var active_change: ?[]Change = null;
-    for (changes.items) |ch, i| {
-        switch (ch.operation) {
-            .Addition, .Deletion => {
-                if (active_change == null) {
-                    active_change = changes.items[i..];
-                }
+    var offset: usize = 0;
+    for (diffs.items) |diff| {
+        var start = offset;
+        switch (diff.operation) {
+            .delete => {
+                offset += diff.text.len;
+                try eds.append(allocator, .{ .range = offsets.locToRange(before, .{ .start = start, .end = offset }, encoding), .newText = "" });
             },
-            .Nothing => {
-                if (active_change) |*ac| {
-                    ac.* = ac.*[0..(i - (changes.items.len - ac.*.len))];
-                    try groups.append(allocator, ac.*);
-                    active_change = null;
-                }
+            .equal => {
+                offset += diff.text.len;
+            },
+            .insert => {
+                try eds.append(allocator, .{ .range = offsets.locToRange(before, .{ .start = start, .end = start }, encoding), .newText = diff.text });
             },
         }
     }
-    if (active_change) |*ac| {
-        ac.* = ac.*[0..(changes.items.len - (changes.items.len - ac.*.len))];
-        try groups.append(allocator, ac.*);
-    }
-
-    // The LCS algorithm works "in reverse", so we're putting everything back
-    // in ascending order
-    var a_lines = std.mem.split(u8, a, "\n");
-    std.mem.reverse([]Change, groups.items);
-    for (groups.items) |group| std.mem.reverse(Change, group);
-
-    var edit_results = try std.ArrayListUnmanaged(types.TextEdit).initCapacity(allocator, groups.items.len);
-    errdefer {
-        for (edit_results.items) |edit| {
-            allocator.free(edit.newText);
-        }
-        edit_results.deinit(allocator);
-    }
-
-    // Convert our grouped changes into `Edit`s
-    for (groups.items) |group| {
-        var range_start = group[0].pos;
-        var range_len: usize = 0;
-        var newText = std.ArrayListUnmanaged(u8){};
-        errdefer newText.deinit(allocator);
-        for (group) |ch| {
-            switch (ch.operation) {
-                .Addition => try newText.append(allocator, ch.value.?),
-                .Deletion => range_len += 1,
-                else => {},
-            }
-        }
-        var range = try char_pos_to_range(
-            &a_lines,
-            a_trim_offset + range_start,
-            a_trim_offset + range_start + range_len,
-        );
-        a_lines.reset();
-        edit_results.appendAssumeCapacity(.{
-            .range = range,
-            .newText = try newText.toOwnedSlice(allocator),
-        });
-    }
-
-    return edit_results;
+    return eds;
 }
 
-fn recur_changes(
-    lcs: *Array2D,
-    changes: *std.ArrayListUnmanaged(Change),
-    a: []const u8,
-    b: []const u8,
-    i: i64,
-    j: i64,
-    allocator: std.mem.Allocator,
-) error{OutOfMemory}!void {
-    // This function recursively works backwards through the LCS table in
-    // order to figure out what kind of changes took place to transform `a`
-    // into `b`
-
-    const ii = @intCast(usize, i);
-    const jj = @intCast(usize, j);
-
-    if (i > 0 and j > 0 and a[ii - 1] == b[jj - 1]) {
-        try changes.append(allocator, .{
-            .operation = .Nothing,
-            .pos = ii - 1,
-            .value = null,
-        });
-        try recur_changes(lcs, changes, a, b, i - 1, j - 1, allocator);
-    } else if (j > 0 and (i == 0 or lcs.get(ii, jj - 1).* >= lcs.get(ii - 1, jj).*)) {
-        try changes.append(allocator, .{
-            .operation = .Addition,
-            .pos = ii,
-            .value = b[jj - 1],
-        });
-        try recur_changes(lcs, changes, a, b, i, j - 1, allocator);
-    } else if (i > 0 and (j == 0 or lcs.get(ii, jj - 1).* < lcs.get(ii - 1, jj).*)) {
-        try changes.append(allocator, .{
-            .operation = .Deletion,
-            .pos = ii - 1,
-            .value = a[ii - 1],
-        });
-        try recur_changes(lcs, changes, a, b, i - 1, j, allocator);
-    }
-}
-
-/// Accept a range that is solely based on buffer/character position and
-/// convert it to line number & character position range
-fn char_pos_to_range(
-    lines: *std.mem.SplitIterator(u8),
-    start: usize,
-    end: usize,
-) Error!types.Range {
-    var char_pos: usize = 0;
-    var line_pos: usize = 0;
-    var result_start_pos: ?types.Position = null;
-    var result_end_pos: ?types.Position = null;
-
-    while (lines.next()) |line| : ({
-        char_pos += line.len + 1;
-        line_pos += 1;
-    }) {
-        if (start >= char_pos and start <= char_pos + line.len) {
-            result_start_pos = .{
-                .line = @intCast(u32, line_pos),
-                .character = @intCast(u32, start - char_pos),
-            };
-        }
-        if (end >= char_pos and end <= char_pos + line.len) {
-            result_end_pos = .{
-                .line = @intCast(u32, line_pos),
-                .character = @intCast(u32, end - char_pos),
-            };
-        }
-    }
-
-    if (result_start_pos == null) return error.InvalidRange;
-
-    // If we did not find an end position, it is outside the range of the
-    // string for some reason so clamp it to the string end position
-    if (result_end_pos == null) {
-        result_end_pos = types.Position{
-            .line = @intCast(u32, line_pos),
-            .character = @intCast(u32, char_pos),
-        };
-    }
-
-    return types.Range{
-        .start = result_start_pos.?,
-        .end = result_end_pos.?,
-    };
-}
-
-// Caller owns returned memory.
-pub fn applyTextEdits(
+/// Caller owns returned memory.
+/// NOTE: As far as I know, this implementation is actually incorrect
+/// as we use intermediate state, but at the same time, it works so
+/// I really don't want to touch it right now. TODO: Investigate + fix.
+pub fn applyContentChanges(
     allocator: std.mem.Allocator,
     text: []const u8,
     content_changes: []const types.TextDocumentContentChangeEvent,
@@ -385,3 +74,41 @@ pub fn applyTextEdits(
 
     return try text_array.toOwnedSliceSentinel(allocator, 0);
 }
+
+// https://cs.opensource.google/go/x/tools/+/master:internal/lsp/diff/diff.go;l=40
+
+fn textEditLessThan(_: void, lhs: types.TextEdit, rhs: types.TextEdit) bool { // sort predicate; the context argument is unused
+    return offsets.rangeLessThan(lhs.range, rhs.range); // edits are ordered purely by their ranges
+}
+
+/// Applies `text_edits` to `text` and returns the edited document; the edits are
+/// sorted by range first. A position that cannot be resolved to an index is
+/// treated as the end of `text`. Caller owns returned memory.
+pub fn applyTextEdits(
+    allocator: std.mem.Allocator,
+    text: []const u8,
+    text_edits: []const types.TextEdit,
+    encoding: offsets.Encoding,
+) ![]const u8 {
+    // Sort a scratch copy so the caller's slice is left untouched.
+    var text_edits_sortable = try allocator.alloc(types.TextEdit, text_edits.len);
+    defer allocator.free(text_edits_sortable);
+    std.mem.copy(types.TextEdit, text_edits_sortable, text_edits);
+    std.sort.sort(types.TextEdit, text_edits_sortable, {}, textEditLessThan);
+
+    var final_text = std.ArrayListUnmanaged(u8){};
+    // Release the partially built output if any append below fails.
+    errdefer final_text.deinit(allocator);
+
+    // `last` is the byte index in `text` up to which the input has been consumed.
+    var last: usize = 0;
+    for (text_edits_sortable) |te| {
+        const start = offsets.maybePositionToIndex(text, te.range.start, encoding) orelse text.len;
+        // Copy the untouched text between the previous edit and this one.
+        if (start > last) try final_text.appendSlice(allocator, text[last..start]);
+        try final_text.appendSlice(allocator, te.newText);
+        last = offsets.maybePositionToIndex(text, te.range.end, encoding) orelse text.len;
+    }
+    if (last < text.len) try final_text.appendSlice(allocator, text[last..]);
+    return try final_text.toOwnedSlice(allocator);
+}
diff --git a/src/diffz b/src/diffz
new file mode 160000
index 0000000..c36d881
--- /dev/null
+++ b/src/diffz
@@ -0,0 +1 @@
+Subproject commit c36d8817926ca9c35d4584b6cb3c497982ae6e37
diff --git a/src/offsets.zig b/src/offsets.zig
index 38b8006..be68555 100644
--- a/src/offsets.zig
+++ b/src/offsets.zig
@@ -17,6 +17,25 @@ pub fn indexToPosition(text: []const u8, index: usize, encoding: Encoding) types
     };
 }
 
+/// Converts `position` into a byte index into `text`, or returns `null`
+/// when `text` does not contain enough lines to reach `position.line`.
+pub fn maybePositionToIndex(text: []const u8, position: types.Position, encoding: Encoding) ?usize {
+    var current_line: u32 = 0;
+    var line_begin: usize = 0;
+    var cursor: usize = 0;
+    // Step over newlines until the requested line starts or the text runs out.
+    while (current_line != position.line and cursor < text.len) : (cursor += 1) {
+        if (text[cursor] != '\n') continue;
+        current_line += 1;
+        line_begin = cursor + 1;
+    }
+    if (current_line != position.line) return null;
+
+    // Translate the character offset within the line into a byte count.
+    const rest_of_line = std.mem.sliceTo(text[line_begin..], '\n');
+    return line_begin + getNCodeUnitByteCount(rest_of_line, position.character, encoding);
+}
+
 pub fn positionToIndex(text: []const u8, position: types.Position, encoding: Encoding) usize {
     var line: u32 = 0;
     var line_start_index: usize = 0;
@@ -333,3 +352,22 @@ pub fn getNCodeUnitByteCount(text: []const u8, n: usize, encoding: Encoding) usi
         },
     }
 }
+
+/// Returns true when range `a` sorts strictly before range `b`.
+/// Ranges are ordered by start position; the end position only breaks ties.
+pub fn rangeLessThan(a: types.Range, b: types.Range) bool {
+    // Testing "start less OR end less" is not a strict ordering: for nested
+    // ranges both `a < b` and `b < a` would hold, which sorting cannot tolerate.
+    if (positionLessThan(a.start, b.start)) return true;
+    if (positionLessThan(b.start, a.start)) return false;
+    return positionLessThan(a.end, b.end);
+}
+
+/// Returns true when position `a` comes strictly before position `b`,
+/// comparing line numbers first and character offsets within a line second.
+pub fn positionLessThan(a: types.Position, b: types.Position) bool {
+    // Positions on different lines are decided by the line number alone.
+    if (a.line != b.line) return a.line < b.line;
+
+    return a.character < b.character;
+}
diff --git a/src/zls.zig b/src/zls.zig
index cdd9b8a..2779184 100644
--- a/src/zls.zig
+++ b/src/zls.zig
@@ -13,3 +13,4 @@ pub const types = @import("lsp.zig");
 pub const URI = @import("uri.zig");
 pub const DocumentStore = @import("DocumentStore.zig");
 pub const ComptimeInterpreter = @import("ComptimeInterpreter.zig");
+pub const diff = @import("diff.zig");
diff --git a/tests/tests.zig b/tests/tests.zig
index b7719af..ebe7bce 100644
--- a/tests/tests.zig
+++ b/tests/tests.zig
@@ -5,6 +5,7 @@ comptime {
     _ = @import("utility/offsets.zig");
     _ = @import("utility/position_context.zig");
     _ = @import("utility/uri.zig");
+    _ = @import("utility/diff.zig");
 
     // TODO Lifecycle Messages
 
diff --git a/tests/utility/diff.zig b/tests/utility/diff.zig
new file mode 100644
index 0000000..0c4ae54
--- /dev/null
+++ b/tests/utility/diff.zig
@@ -0,0 +1,30 @@
+const std = @import("std");
+const zls = @import("zls");
+
+const allocator = std.testing.allocator;
+
+fn gen(alloc: std.mem.Allocator, rand: std.rand.Random) ![]const u8 {
+    const bytes = try alloc.alloc(u8, rand.intRangeAtMost(usize, 16, 1024)); // length drawn from 16..=1024
+    for (bytes) |*byte| byte.* = rand.intRangeAtMost(u8, 32, 126); // every byte drawn from 32..=126
+    return bytes;
+}
+
+// Round-trip property: applying `edits(pre, post)` to `pre` must reproduce `post`.
+test "diff - random" {
+    var arena = std.heap.ArenaAllocator.init(allocator);
+    defer arena.deinit();
+    const scratch = arena.allocator();
+    // A fixed seed keeps the generated inputs identical from run to run.
+    var prng = std.rand.DefaultPrng.init(0);
+    const random = prng.random();
+
+    var round: usize = 0;
+    while (round < 100) : (round += 1) {
+        // Drop this round's allocations when it ends, keeping the arena's capacity.
+        defer _ = arena.reset(.retain_capacity);
+        const pre = try gen(scratch, random);
+        const post = try gen(scratch, random);
+        const text_edits = try zls.diff.edits(scratch, pre, post, .@"utf-8");
+        try std.testing.expectEqualStrings(post, try zls.diff.applyTextEdits(scratch, pre, text_edits.items, .@"utf-8"));
+    }
+}