Merge branch 'performance-improvements' into 'trunk'

optimised implementation of html_encoder. See merge request Nicd/glentities!1
2024-05-23 04:52:53 +00:00 · 2024-05-23 04:52:53 +00:00 · 8431bfedd7
commit 8431bfedd7
parent aea7e60858 35eeb078bc
8 changed files with 343 additions and 39 deletions
--- a/src/ffi_js.mjs
+++ b/src/ffi_js.mjs
@ -1,3 +0,0 @@
-export function normalize(str, mode) {
-  return str.normalize(mode);
-}
--- a/src/glentities/html_encoder.gleam
+++ b/src/glentities/html_encoder.gleam
@ -1,33 +1,15 @@
-import gleam/string
-import gleam/string_builder.{type StringBuilder}
-import glentities/internal/string_utils
+@target(erlang)
+import glentities/internal/html_encoder/erl as escaper
+
+@target(javascript)
+import glentities/internal/html_encoder/generic as escaper

 /// Encode text to be safe in the HTML body, inside element or attribute content.
 ///
 /// `&`, `<`, `>`, `'`, and `"` are encoded.
 ///
 /// Note! Not suitable for outputting inside `<style>`, `<script>` elements.
+///
+/// The work is delegated to the target-specific `escape` function that is
+/// imported under the `escaper` alias.
+///
 pub fn encode(text: String) -> String {
-  text
-  |> string_utils.normalise()
-  |> do_encode(string_builder.new())
-}
-
-fn do_encode(text: String, acc: StringBuilder) {
-  case text {
-    "" -> string_builder.to_string(acc)
-    "&" <> rest -> do_encode(rest, string_builder.append(acc, "&amp;"))
-    "<" <> rest -> do_encode(rest, string_builder.append(acc, "&lt;"))
-    ">" <> rest -> do_encode(rest, string_builder.append(acc, "&gt;"))
-    "\"" <> rest -> do_encode(rest, string_builder.append(acc, "&quot;"))
-    "'" <> rest -> do_encode(rest, string_builder.append(acc, "&#39;"))
-    other -> {
-      let maybe_grapheme = string.pop_grapheme(other)
-      case maybe_grapheme {
-        Ok(#(grapheme, rest)) ->
-          do_encode(rest, string_builder.append(acc, grapheme))
-        Error(Nil) -> string_builder.to_string(acc)
-      }
-    }
-  }
+  // NOTE(review): the implementation removed by this change ran
+  // `string_utils.normalise()` on the text before encoding it; the text is
+  // now handed to `escaper.escape` as-is. Confirm that dropping the
+  // normalisation step is intended.
+  escaper.escape(text)
 }
--- a/src/glentities/internal/html_encoder/erl.gleam
+++ b/src/glentities/internal/html_encoder/erl.gleam
@ -0,0 +1,158 @@
+@target(erlang)
+import gleam/bit_array
+@target(erlang)
+import gleam/list
+
+/// Escapes `<`, `>`, `&`, `"` and `'` in `text`. Only compiled for the
+/// Erlang target.
+///
+/// The string is viewed as a `BitArray`, `do_escape` collects the output
+/// pieces in reverse order, and those pieces are then reversed, concatenated
+/// and coerced back into a `String`.
+///
+@target(erlang)
+pub fn escape(text: String) -> String {
+  // This version is highly optimised for the Erlang target, it treats Strings
+  // as BitArrays and slices them to share as much as possible. You can find
+  // more details in `do_escape`.
+  let bits = <<text:utf8>>
+  let acc = do_escape(bits, 0, bits, [])
+
+  list.reverse(acc)
+  |> bit_array.concat
+  |> coerce
+}
+
+// Reinterprets a `BitArray` as a `String`. The Erlang implementation
+// (`glentities_ffi:coerce/1`) returns its argument unchanged, so no UTF-8
+// validation takes place here.
+@target(erlang)
+@external(erlang, "glentities_ffi", "coerce")
+fn coerce(bit_array: BitArray) -> String
+
+// A possible way to escape chars would be to split the string into graphemes,
+// traverse those one by one and accumulate them back into a string, escaping
+// ">", "<", etc. as we see them.
+//
+// However, we can be a lot more performant by working directly on the
+// `BitArray` representing a Gleam UTF-8 String (for now this works just on
+// the Erlang side).
+// This means that, instead of popping a grapheme at a time, we can work
+// directly on BitArray slices: this has the big advantage of making sure we
+// share as much as possible with the original string without having to build
+// a new one from scratch.
+//
+// Arguments:
+// - `bin`: the part of the input that has not been looked at yet.
+// - `skip`: byte offset into `original` at which `bin` starts.
+// - `original`: the complete input; it is only ever sliced.
+// - `acc`: the output pieces collected so far, most recent first.
+//
+@target(erlang)
+fn do_escape(
+  bin: BitArray,
+  skip: Int,
+  original: BitArray,
+  acc: List(BitArray),
+) -> List(BitArray) {
+  case bin {
+    // If we find a char to escape we just advance the `skip` counter so that
+    // it will be ignored in the following slice, then we append the escaped
+    // version to the accumulator.
+    <<"<":utf8, rest:bits>> -> {
+      let acc = [<<"&lt;":utf8>>, ..acc]
+      do_escape(rest, skip + 1, original, acc)
+    }
+
+    <<">":utf8, rest:bits>> -> {
+      let acc = [<<"&gt;":utf8>>, ..acc]
+      do_escape(rest, skip + 1, original, acc)
+    }
+
+    <<"&":utf8, rest:bits>> -> {
+      let acc = [<<"&amp;":utf8>>, ..acc]
+      do_escape(rest, skip + 1, original, acc)
+    }
+
+    <<"\"":utf8, rest:bits>> -> {
+      let acc = [<<"&quot;":utf8>>, ..acc]
+      do_escape(rest, skip + 1, original, acc)
+    }
+
+    <<"'":utf8, rest:bits>> -> {
+      let acc = [<<"&#39;":utf8>>, ..acc]
+      do_escape(rest, skip + 1, original, acc)
+    }
+
+    // For any other byte that doesn't need to be escaped we go into an inner
+    // loop, consuming as much "non-escapable" chars as possible. The byte
+    // just consumed is the first one of the run, hence the initial length 1.
+    <<_char, rest:bits>> -> do_escape_normal(rest, skip, original, acc, 1)
+
+    // Nothing left to look at: everything is already in the accumulator.
+    <<>> -> acc
+
+    // Needed for exhaustiveness: a BitArray holding fewer than 8 bits matches
+    // neither `<<_char, rest:bits>>` nor `<<>>`. Bits built from a String are
+    // expected to always be byte aligned, so this branch should never run.
+    _ -> panic as "non byte aligned string, all strings should be byte aligned"
+  }
+}
+
+// Inner loop of the Erlang escaper, entered once a byte that needs no
+// escaping has been seen. `len` counts the bytes of the current unescaped
+// run, which starts at byte offset `skip` of `original`; `bin` is the unread
+// remainder of the input and `acc` holds the output pieces collected so far,
+// most recent first.
+@target(erlang)
+fn do_escape_normal(
+  bin: BitArray,
+  skip: Int,
+  original: BitArray,
+  acc: List(BitArray),
+  len: Int,
+) -> List(BitArray) {
+  // Remember, if we're here it means we've found a char that doesn't need to be
+  // escaped, so what we want to do is advance the `len` counter until we reach
+  // a char that _does_ need to be escaped and take the slice going from
+  // `skip` with size `len`.
+  //
+  // Imagine we're escaping this string: "abc<def&ghi" and we've reached 'd':
+  // ```
+  //    abc<def&ghi
+  //        ^ `skip` points here
+  // ```
+  // We're going to be increasing `len` until we reach the '&':
+  // ```
+  //    abc<def&ghi
+  //        ^^^ len will be 3 when we reach the '&' that needs escaping
+  // ```
+  // So we take the slice corresponding to "def".
+  //
+  case bin {
+    // If we reach a char that has to be escaped we append the slice starting
+    // from `skip` with size `len` and the escaped char.
+    // This is what allows us to share as much of the original string as
+    // possible: we only allocate a new BitArray for the escaped chars,
+    // everything else is just a slice of the original String.
+    // The next `skip` is `skip + len + 1`: past the slice and past the
+    // escaped char itself.
+    <<"<":utf8, rest:bits>> -> {
+      let assert Ok(slice) = bit_array.slice(original, skip, len)
+      let acc = [<<"&lt;":utf8>>, slice, ..acc]
+      do_escape(rest, skip + len + 1, original, acc)
+    }
+
+    <<">":utf8, rest:bits>> -> {
+      let assert Ok(slice) = bit_array.slice(original, skip, len)
+      let acc = [<<"&gt;":utf8>>, slice, ..acc]
+      do_escape(rest, skip + len + 1, original, acc)
+    }
+
+    <<"&":utf8, rest:bits>> -> {
+      let assert Ok(slice) = bit_array.slice(original, skip, len)
+      let acc = [<<"&amp;":utf8>>, slice, ..acc]
+      do_escape(rest, skip + len + 1, original, acc)
+    }
+
+    <<"\"":utf8, rest:bits>> -> {
+      let assert Ok(slice) = bit_array.slice(original, skip, len)
+      let acc = [<<"&quot;":utf8>>, slice, ..acc]
+      do_escape(rest, skip + len + 1, original, acc)
+    }
+
+    <<"'":utf8, rest:bits>> -> {
+      let assert Ok(slice) = bit_array.slice(original, skip, len)
+      let acc = [<<"&#39;":utf8>>, slice, ..acc]
+      do_escape(rest, skip + len + 1, original, acc)
+    }
+
+    // If a char doesn't need escaping we keep increasing the length of the
+    // slice we're going to take.
+    <<_char, rest:bits>> -> do_escape_normal(rest, skip, original, acc, len + 1)
+
+    // End of input while inside an unescaped run.
+    <<>> ->
+      case skip {
+        // `skip` only grows when a char gets escaped, so 0 means nothing was
+        // escaped at all and the whole input can be returned untouched.
+        0 -> [original]
+        // Otherwise flush the trailing run as one last slice.
+        _ -> {
+          let assert Ok(slice) = bit_array.slice(original, skip, len)
+          [slice, ..acc]
+        }
+      }
+
+    // Needed for exhaustiveness: fewer than 8 remaining bits match none of
+    // the patterns above. Not expected for bits built from a String.
+    _ -> panic as "non byte aligned string, all strings should be byte aligned"
+  }
+}
--- a/src/glentities/internal/html_encoder/generic.gleam
+++ b/src/glentities/internal/html_encoder/generic.gleam
@ -0,0 +1,132 @@
+import gleam/list
+import gleam/string
+
+/// This `escape` function will work on all targets, beware that the version
+/// specifically optimised for Erlang will be _way faster_ than this one when
+/// running on the BEAM. That's why this fallback implementation is only ever
+/// used when running on the JS backend.
+///
+/// `<`, `>`, `&`, `"` and `'` are replaced by their entities; every other
+/// part of `text` is copied over in slices.
+///
+pub fn escape(text: String) -> String {
+  // Start at offset 0 with an empty accumulator, a zero-length run and the
+  // "inside a run of normal chars" flag off. The pieces come back most
+  // recent first, so they are reversed before being joined.
+  do_escape(text, 0, text, [], 0, False)
+  |> list.reverse
+  |> string.join(with: "")
+}
+
+// The logic behind this function is exactly the same as the erlang one: we
+// walk the string one unit at a time (whatever the `first`/`drop_first`
+// helpers yield: a byte on Erlang, a UTF-16 code unit on JavaScript) and only
+// ever take slices of it (constant
+// time operation that ensures maximum sharing). However, this implementation is
+// a little more convoluted since we cannot define it as two mutually recursive
+// functions as we did with the Erlang one (or it won't be tail call optimised
+// on the JS target).
+//
+// Arguments:
+// - `string`: the part of the input that has not been looked at yet.
+// - `skip`: offset into `original` at which the pending slice starts.
+// - `original`: the complete input; it is only ever sliced.
+// - `acc`: the output pieces collected so far, most recent first.
+// - `len`: length of the current run of chars that need no escaping.
+// - `found_normal`: `True` while inside such a run; it selects between the
+//   two halves of the `case` below.
+fn do_escape(
+  string: String,
+  skip: Int,
+  original: String,
+  acc: List(String),
+  len: Int,
+  found_normal: Bool,
+) -> List(String) {
+  case found_normal, first(string) {
+    // Not inside a run: a char to escape only moves `skip` past itself and
+    // pushes its entity onto the accumulator.
+    False, "<" -> {
+      let rest = drop_first(string)
+      let acc = ["&lt;", ..acc]
+      do_escape(rest, skip + 1, original, acc, 0, False)
+    }
+
+    False, ">" -> {
+      let rest = drop_first(string)
+      let acc = ["&gt;", ..acc]
+      do_escape(rest, skip + 1, original, acc, 0, False)
+    }
+
+    False, "&" -> {
+      let rest = drop_first(string)
+      let acc = ["&amp;", ..acc]
+      do_escape(rest, skip + 1, original, acc, 0, False)
+    }
+
+    False, "\"" -> {
+      let rest = drop_first(string)
+      let acc = ["&quot;", ..acc]
+      do_escape(rest, skip + 1, original, acc, 0, False)
+    }
+
+    False, "'" -> {
+      let rest = drop_first(string)
+      let acc = ["&#39;", ..acc]
+      do_escape(rest, skip + 1, original, acc, 0, False)
+    }
+
+    // End of input with no pending run: the accumulator is complete.
+    False, "" -> acc
+
+    // For any other char that doesn't need to be escaped we go into an inner
+    // loop, consuming as much "non-escapable" chars as possible.
+    False, _ -> {
+      let rest = drop_first(string)
+      do_escape(rest, skip, original, acc, 1, True)
+    }
+
+    // Inside a run: a char to escape first flushes the run as a slice of
+    // `original`, then pushes its entity; `skip` moves past both.
+    True, "<" -> {
+      let rest = drop_first(string)
+      let slice = slice(original, skip, len)
+      let acc = ["&lt;", slice, ..acc]
+      do_escape(rest, skip + len + 1, original, acc, 0, False)
+    }
+
+    True, ">" -> {
+      let rest = drop_first(string)
+      let slice = slice(original, skip, len)
+      let acc = ["&gt;", slice, ..acc]
+      do_escape(rest, skip + len + 1, original, acc, 0, False)
+    }
+
+    True, "&" -> {
+      let rest = drop_first(string)
+      let slice = slice(original, skip, len)
+      let acc = ["&amp;", slice, ..acc]
+      do_escape(rest, skip + len + 1, original, acc, 0, False)
+    }
+
+    True, "\"" -> {
+      let rest = drop_first(string)
+      let slice = slice(original, skip, len)
+      let acc = ["&quot;", slice, ..acc]
+      do_escape(rest, skip + len + 1, original, acc, 0, False)
+    }
+
+    True, "'" -> {
+      let rest = drop_first(string)
+      let slice = slice(original, skip, len)
+      let acc = ["&#39;", slice, ..acc]
+      do_escape(rest, skip + len + 1, original, acc, 0, False)
+    }
+
+    // End of input while inside a run.
+    True, "" ->
+      case skip {
+        // `skip` only grows when a char gets escaped, so 0 means nothing was
+        // escaped at all and the whole input can be returned untouched.
+        0 -> [original]
+        // Otherwise flush the trailing run as one last slice.
+        _ -> {
+          let slice = slice(original, skip, len)
+          [slice, ..acc]
+        }
+      }
+
+    // If a char doesn't need escaping we keep increasing the length of the
+    // slice we're going to take.
+    True, _ -> {
+      let rest = drop_first(string)
+      do_escape(rest, skip, original, acc, len + 1, True)
+    }
+  }
+}
+
+// Returns the first unit of the string, or "" when the string is empty. The
+// unit is a single byte on Erlang and `string.slice(0, 1)` on JavaScript, so
+// the result is only meant to be compared against single ASCII chars.
+@external(erlang, "glentities_ffi", "first")
+@external(javascript, "../../../glentities_ffi.mjs", "first")
+fn first(_string: String) -> String
+
+// Returns the string without its first unit (one byte on Erlang,
+// `string.slice(1)` on JavaScript). An empty string stays empty.
+@external(erlang, "glentities_ffi", "drop_first")
+@external(javascript, "../../../glentities_ffi.mjs", "drop_first")
+fn drop_first(_string: String) -> String
+
+// Returns `_len` units of `_string` starting at offset `_from`.
+// The third argument is a length, not an end index: the Erlang side calls
+// `binary:part(String, From, Len)` and the JavaScript side calls
+// `string.slice(from, from + len)`.
+@external(erlang, "glentities_ffi", "slice")
+@external(javascript, "../../../glentities_ffi.mjs", "slice")
+fn slice(_string: String, _from: Int, _len: Int) -> String
--- a/src/glentities/internal/string_utils.gleam
+++ b/src/glentities/internal/string_utils.gleam
@ -8,5 +8,5 @@ pub fn normalise(text: String) -> String {
 }

+/// Binding to the `normalize` helper exported by `glentities_ffi.mjs`, which
+/// returns `str.normalize(mode)`.
@target(javascript)
-@external(javascript, "../../ffi_js.mjs", "normalize")
+@external(javascript, "../../glentities_ffi.mjs", "normalize")
 pub fn normalise_js(text text: String, mode mode: String) -> String
--- a/src/glentities_ffi.erl
+++ b/src/glentities_ffi.erl
@ -0,0 +1,20 @@
+-module(glentities_ffi).
+-export([coerce/1, slice/3, first/1, drop_first/1]).
+
+%% Identity function. The Gleam side declares it as BitArray -> String so a
+%% bit array can be handed back as a string.
+coerce(X) -> X.
+
+%% Returns the Len bytes of String that start at byte offset From.
+%% binary:part/3 raises badarg when the requested range is out of bounds.
+slice(String, From, Len) ->
+        binary:part(String, From, Len).
+
+%% Returns the first byte of String as a one-byte binary, or <<>> when String
+%% is empty.
+first(String) ->
+    case String of
+        <<>> -> <<>>;
+        <<First, _/bitstring>> ->
+            <<First>>
+    end.
+
+%% Returns String without its first byte; an empty binary stays empty.
+drop_first(String) ->
+    case String of
+        <<>> -> <<>>;
+        <<_, Rest/bitstring>> -> Rest
+    end.
--- a/src/glentities_ffi.mjs
+++ b/src/glentities_ffi.mjs
@ -0,0 +1,15 @@
+// Thin wrapper around `String.prototype.normalize`; `mode` is passed through
+// unchanged as the normalization form.
+export function normalize(str, mode) {
+  return str.normalize(mode);
+}
+
+// Returns the part of `string` from index `from` up to, but not including,
+// index `from + len`. The third argument is a length, not an end index.
+export function slice(string, from, len) {
+  return string.slice(from, from + len);
+}
+
+// Returns the first UTF-16 code unit of `string` as a string, or "" when
+// `string` is empty.
+export function first(string) {
+  return string.slice(0, 1);
+}
+
+// Returns `string` without its first UTF-16 code unit; "" stays "".
+export function drop_first(string) {
+  return string.slice(1);
+}