Merge branch 'performance-improvements' into 'trunk'

Optimised implementation of html_encoder. See merge request Nicd/glentities!1
2024-05-23 04:52:53 +00:00 · 2024-05-23 04:52:53 +00:00 · 8431bfedd7
commit 8431bfedd7
parent aea7e60858 35eeb078bc
8 changed files with 343 additions and 39 deletions
--- a/src/ffi_js.mjs
+++ b/src/ffi_js.mjs
@ -1,3 +0,0 @@
 /**
  * Returns `str` normalised with the built-in `String.prototype.normalize`,
  * using the normalisation form named by `mode`.
  */
 export function normalize(str, mode) {
  const normalised = str.normalize(mode);
  return normalised;
 }
--- a/src/glentities/html_encoder.gleam
+++ b/src/glentities/html_encoder.gleam
@ -1,33 +1,15 @@
-import gleam/string
+@target(erlang)
-import gleam/string_builder.{type StringBuilder}
+import glentities/internal/html_encoder/erl as escaper
-import glentities/internal/string_utils
+
@target(javascript)
 import glentities/internal/html_encoder/generic as escaper
 /// Encode text to be safe in the HTML body, inside element or attribute content.
 ///
 /// `&`, `<`, `>`, `'`, and `"` are encoded.
 ///
 /// Note! Not suitable for outputting inside `<style>`, `<script>` elements.
 ///
 pub fn encode(text: String) -> String {
-  text
+  escaper.escape(text)
  |> string_utils.normalise()
  |> do_encode(string_builder.new())
 }
 // Walks `text` from the front and appends to `acc` either the entity for
 // `&`, `<`, `>`, `"` or `'`, or the next grapheme unchanged. The builder is
 // turned into a String once `text` is exhausted.
 fn do_encode(text: String, acc: StringBuilder) {
  case text {
    "" -> string_builder.to_string(acc)
    "&" <> tail -> {
      let escaped = string_builder.append(acc, "&amp;")
      do_encode(tail, escaped)
    }
    "<" <> tail -> {
      let escaped = string_builder.append(acc, "&lt;")
      do_encode(tail, escaped)
    }
    ">" <> tail -> {
      let escaped = string_builder.append(acc, "&gt;")
      do_encode(tail, escaped)
    }
    "\"" <> tail -> {
      let escaped = string_builder.append(acc, "&quot;")
      do_encode(tail, escaped)
    }
    "'" <> tail -> {
      let escaped = string_builder.append(acc, "&#39;")
      do_encode(tail, escaped)
    }
    // Anything else is copied over one grapheme at a time.
    _ ->
      case string.pop_grapheme(text) {
        Ok(#(grapheme, tail)) -> {
          let copied = string_builder.append(acc, grapheme)
          do_encode(tail, copied)
        }
        Error(Nil) -> string_builder.to_string(acc)
      }
  }
 }
--- a/src/glentities/internal/html_encoder/erl.gleam
+++ b/src/glentities/internal/html_encoder/erl.gleam
@ -0,0 +1,158 @@
@target(erlang)
 import gleam/bit_array
@target(erlang)
 import gleam/list
@target(erlang)
 pub fn escape(text: String) -> String {
  // Erlang-only implementation: the String is viewed as a BitArray so that
  // `do_escape` can hand back slices of it instead of building new data for
  // the parts that need no escaping. `do_escape` returns its chunks most
  // recent first, hence the reverse before they are concatenated and turned
  // back into a String.
  let bytes = <<text:utf8>>
  bytes
  |> do_escape(0, bytes, [])
  |> list.reverse
  |> bit_array.concat
  |> coerce
 }
// Reinterprets a `BitArray` as a `String`. The Erlang implementation
// (`glentities_ffi:coerce/1`) returns its argument unchanged, so no check is
// made that the bytes are valid UTF-8; callers must only pass data that is.
@target(erlang)
@external(erlang, "glentities_ffi", "coerce")
 fn coerce(bit_array: BitArray) -> String
 // A possible way to escape chars would be to split the string into graphemes,
 // traverse those one by one and accumulate them back into a string escaping
 // ">", "<", etc. as we see them.
 //
 // However, we can be a lot more performant by working directly on the
 // `BitArray` representing a Gleam UTF-8 String (for now this works just on
 // the Erlang side).
 // This means that, instead of popping a grapheme at a time, we can work
 // directly on BitArray slices: this has the big advantage of making sure we
 // share as much as possible with the original string without having to build
 // a new one from scratch.
 //
@target(erlang)
 fn do_escape(
  bin: BitArray,
  skip: Int,
  original: BitArray,
  acc: List(BitArray),
 ) -> List(BitArray) {
  // `bin` is the part of `original` that still has to be scanned and `skip`
  // is the byte offset in `original` at which `bin` starts. `acc` holds the
  // output chunks gathered so far, most recent first.
  case bin {
    // A char that must be escaped: push its entity and step one byte forward
    // so the char itself is left out of any later slice.
    <<"<":utf8, tail:bits>> ->
      do_escape(tail, skip + 1, original, [<<"&lt;":utf8>>, ..acc])
    <<">":utf8, tail:bits>> ->
      do_escape(tail, skip + 1, original, [<<"&gt;":utf8>>, ..acc])
    <<"&":utf8, tail:bits>> ->
      do_escape(tail, skip + 1, original, [<<"&amp;":utf8>>, ..acc])
    <<"\"":utf8, tail:bits>> ->
      do_escape(tail, skip + 1, original, [<<"&quot;":utf8>>, ..acc])
    <<"'":utf8, tail:bits>> ->
      do_escape(tail, skip + 1, original, [<<"&#39;":utf8>>, ..acc])
    // Nothing left to scan.
    <<>> -> acc
    // Any other byte opens a run of bytes that are copied as they are: hand
    // over to the inner loop with a run length of 1.
    <<_byte, tail:bits>> -> do_escape_normal(tail, skip, original, acc, 1)
    // Only reachable if what is left is not a whole number of bytes, which
    // should not happen for data that came from a String.
    _ -> panic as "non byte aligned string, all strings should be byte aligned"
  }
 }
@target(erlang)
 fn do_escape_normal(
  bin: BitArray,
  skip: Int,
  original: BitArray,
  acc: List(BitArray),
  len: Int,
 ) -> List(BitArray) {
  // We get here after seeing at least one byte that needs no escaping. The
  // run of such bytes starts at byte offset `skip` in `original` and is
  // currently `len` bytes long; `bin` is whatever follows the run.
  //
  // Example with "abc<def&ghi", entered right after reading 'd':
  // ```
  //    abc<def&ghi
  //        ^ `skip` is the offset of 'd'
  //        ^^^ `len` has grown to 3 by the time '&' is reached
  // ```
  // so the chunk pushed just before "&amp;" is the slice "def".
  case bin {
    // A char that must be escaped ends the run: push the run as a slice of
    // `original`, then the entity, and resume the outer loop just past the
    // escaped char. Only the entities are new data; the runs are slices.
    <<"<":utf8, tail:bits>> -> {
      let assert Ok(run) = bit_array.slice(original, skip, len)
      do_escape(tail, skip + len + 1, original, [<<"&lt;":utf8>>, run, ..acc])
    }
    <<">":utf8, tail:bits>> -> {
      let assert Ok(run) = bit_array.slice(original, skip, len)
      do_escape(tail, skip + len + 1, original, [<<"&gt;":utf8>>, run, ..acc])
    }
    <<"&":utf8, tail:bits>> -> {
      let assert Ok(run) = bit_array.slice(original, skip, len)
      do_escape(tail, skip + len + 1, original, [<<"&amp;":utf8>>, run, ..acc])
    }
    <<"\"":utf8, tail:bits>> -> {
      let assert Ok(run) = bit_array.slice(original, skip, len)
      do_escape(tail, skip + len + 1, original, [<<"&quot;":utf8>>, run, ..acc])
    }
    <<"'":utf8, tail:bits>> -> {
      let assert Ok(run) = bit_array.slice(original, skip, len)
      do_escape(tail, skip + len + 1, original, [<<"&#39;":utf8>>, run, ..acc])
    }
    // End of input while the run started at offset 0: nothing was escaped, so
    // the input itself is the only chunk.
    <<>> if skip == 0 -> [original]
    // End of input otherwise: the run is the last chunk.
    <<>> -> {
      let assert Ok(run) = bit_array.slice(original, skip, len)
      [run, ..acc]
    }
    // One more byte that needs no escaping: make the run one byte longer.
    <<_byte, tail:bits>> -> do_escape_normal(tail, skip, original, acc, len + 1)
    _ -> panic as "non byte aligned string, all strings should be byte aligned"
  }
 }
--- a/src/glentities/internal/html_encoder/generic.gleam
+++ b/src/glentities/internal/html_encoder/generic.gleam
@ -0,0 +1,132 @@
 import gleam/list
 import gleam/string
 /// Escapes `&`, `<`, `>`, `"` and `'` in `text` with their HTML entities.
 ///
 /// This version only relies on the `first`, `drop_first` and `slice` helpers,
 /// which have both an Erlang and a JavaScript binding, so it builds on either
 /// target. According to the original author the Erlang-specific version is
 /// much faster on the BEAM, which is why the encoder only picks this module
 /// when compiling to JavaScript.
 ///
 pub fn escape(text: String) -> String {
  // `do_escape` returns the output chunks most recent first.
  let reversed_chunks = do_escape(text, 0, text, [], 0, False)
  string.join(list.reverse(reversed_chunks), with: "")
 }
 // Same approach as the Erlang-specific encoder: scan the input one unit at a
 // time (whatever `first` returns: one byte with the Erlang binding, one
 // UTF-16 code unit with the JavaScript binding), push an entity for each
 // char that needs escaping and a slice of `original` for each run of chars
 // that do not.
 //
 // `skip` is the offset in `original` where the current run starts, `len` is
 // the length of that run and `found_normal` tells whether we are inside a
 // run. The two states live in a single self-recursive function because, per
 // the original author, two mutually recursive functions would not be tail
 // call optimised on the JS target.
 fn do_escape(
  string: String,
  skip: Int,
  original: String,
  acc: List(String),
  len: Int,
  found_normal: Bool,
 ) -> List(String) {
  case found_normal, first(string) {
    // Not inside a run.
    // End of input: the chunks gathered so far are the result.
    False, "" -> acc
    // A char to escape: push its entity and step past it.
    False, "<" ->
      do_escape(drop_first(string), skip + 1, original, ["&lt;", ..acc], 0, False)
    False, ">" ->
      do_escape(drop_first(string), skip + 1, original, ["&gt;", ..acc], 0, False)
    False, "&" ->
      do_escape(drop_first(string), skip + 1, original, ["&amp;", ..acc], 0, False)
    False, "\"" ->
      do_escape(drop_first(string), skip + 1, original, ["&quot;", ..acc], 0, False)
    False, "'" ->
      do_escape(drop_first(string), skip + 1, original, ["&#39;", ..acc], 0, False)
    // Anything else opens a run of length 1.
    False, _ -> do_escape(drop_first(string), skip, original, acc, 1, True)

    // Inside a run.
    // End of input with a run that started at offset 0: nothing was escaped,
    // so the input itself is the only chunk.
    True, "" if skip == 0 -> [original]
    // End of input otherwise: the run is the last chunk.
    True, "" -> [slice(original, skip, len), ..acc]
    // A char to escape ends the run: push the run, then the entity, and
    // continue just past the escaped char.
    True, "<" ->
      do_escape(
        drop_first(string),
        skip + len + 1,
        original,
        ["&lt;", slice(original, skip, len), ..acc],
        0,
        False,
      )
    True, ">" ->
      do_escape(
        drop_first(string),
        skip + len + 1,
        original,
        ["&gt;", slice(original, skip, len), ..acc],
        0,
        False,
      )
    True, "&" ->
      do_escape(
        drop_first(string),
        skip + len + 1,
        original,
        ["&amp;", slice(original, skip, len), ..acc],
        0,
        False,
      )
    True, "\"" ->
      do_escape(
        drop_first(string),
        skip + len + 1,
        original,
        ["&quot;", slice(original, skip, len), ..acc],
        0,
        False,
      )
    True, "'" ->
      do_escape(
        drop_first(string),
        skip + len + 1,
        original,
        ["&#39;", slice(original, skip, len), ..acc],
        0,
        False,
      )
    // One more char that needs no escaping: make the run one unit longer.
    True, _ -> do_escape(drop_first(string), skip, original, acc, len + 1, True)
  }
 }
// Returns the first unit of the string, or "" when the string is empty. A
// unit is one byte with the Erlang binding and one UTF-16 code unit with the
// JavaScript binding, so the result may be only part of a character and is
// meant for comparison against single ASCII chars.
@external(erlang, "glentities_ffi", "first")
@external(javascript, "../../../glentities_ffi.mjs", "first")
 fn first(_string: String) -> String
// Returns the string without its first unit (one byte with the Erlang
// binding, one UTF-16 code unit with the JavaScript binding). An empty string
// is returned as an empty string.
@external(erlang, "glentities_ffi", "drop_first")
@external(javascript, "../../../glentities_ffi.mjs", "drop_first")
 fn drop_first(_string: String) -> String
// Returns the part of the string that starts at offset `_from` and is `_len`
// units long (bytes with the Erlang binding, UTF-16 code units with the
// JavaScript binding).
//
// The third argument is a length, not an end index: the Erlang binding passes
// it to `binary:part/3` as the length and the JavaScript binding computes
// `from + len` itself.
@external(erlang, "glentities_ffi", "slice")
@external(javascript, "../../../glentities_ffi.mjs", "slice")
 fn slice(_string: String, _from: Int, _len: Int) -> String
--- a/src/glentities/internal/string_utils.gleam
+++ b/src/glentities/internal/string_utils.gleam
@ -8,5 +8,5 @@ pub fn normalise(text: String) -> String {
 }
@target(javascript)
-@external(javascript, "../../ffi_js.mjs", "normalize")
+@external(javascript, "../../glentities_ffi.mjs", "normalize")
 pub fn normalise_js(text text: String, mode mode: String) -> String
--- a/src/glentities_ffi.erl
+++ b/src/glentities_ffi.erl
@ -0,0 +1,20 @@
 -module(glentities_ffi).
 -export([coerce/1, slice/3, first/1, drop_first/1]).
 %% Identity function: returns its argument unchanged. It is bound on the
 %% Gleam side as `coerce(BitArray) -> String`, so it only changes the type
 %% the Gleam compiler sees; nothing is checked or converted here.
 coerce(X) -> X.
 %% Returns the `Len` bytes of the binary `String` that start at byte offset
 %% `From`.
 slice(String, From, Len) ->
     binary:part(String, {From, Len}).
 %% Returns the first byte of `String` as a one-byte binary, or the empty
 %% binary when `String` is empty.
 first(String) ->
     case String of
         <<Byte, _/bitstring>> -> <<Byte>>;
         <<>> -> <<>>
     end.
 %% Returns `String` without its first byte; the empty binary is returned
 %% as the empty binary.
 drop_first(String) ->
     case String of
         <<_, Tail/bitstring>> -> Tail;
         <<>> -> <<>>
     end.
--- a/src/glentities_ffi.mjs
+++ b/src/glentities_ffi.mjs
@ -0,0 +1,15 @@
 /**
  * Returns `str` normalised with the built-in `String.prototype.normalize`,
  * using the normalisation form named by `mode`.
  */
 export function normalize(str, mode) {
  const normalised = str.normalize(mode);
  return normalised;
 }
 /**
  * Returns the `len` UTF-16 code units of `string` that start at index
  * `from`. The third argument is a length, not an end index.
  */
 export function slice(string, from, len) {
  const end = from + len;
  return string.slice(from, end);
 }
 /**
  * Returns the first UTF-16 code unit of `string` as a string, or "" when
  * `string` is empty.
  */
 export function first(string) {
  const head = string.charAt(0);
  return head;
 }
 /**
  * Returns `string` without its first UTF-16 code unit; "" stays "".
  */
 export function drop_first(string) {
  const tail = string.slice(1);
  return tail;
 }
--- a/test/glentities_test.gleam
+++ b/test/glentities_test.gleam
@ -56,8 +56,8 @@ pub fn roundtrip_named_test() {
  should.equal(
    input
-    |> glentities.encode(glentities.Named)
+      |> glentities.encode(glentities.Named)
-    |> glentities.decode(),
+      |> glentities.decode(),
    input,
  )
 }
@ -70,8 +70,8 @@ pub fn roundtrip_hex_test() {
  should.equal(
    input
-    |> glentities.encode(glentities.Hex)
+      |> glentities.encode(glentities.Hex)
-    |> glentities.decode(),
+      |> glentities.decode(),
    input,
  )
 }
@ -85,22 +85,22 @@ pub fn tco_test() {
  should.equal(
    input
-    |> glentities.encode(glentities.Hex)
+      |> glentities.encode(glentities.Hex)
-    |> glentities.decode(),
+      |> glentities.decode(),
    input,
  )
  should.equal(
    input
-    |> glentities.encode(glentities.Named)
+      |> glentities.encode(glentities.Named)
-    |> glentities.decode(),
+      |> glentities.decode(),
    input,
  )
  should.equal(
    input
-    |> glentities.encode(glentities.HTMLBody)
+      |> glentities.encode(glentities.HTMLBody)
-    |> glentities.decode(),
+      |> glentities.decode(),
    input,
  )
 }