optimised implementation of html_encoder

This commit is contained in:
Giacomo Cavalieri 2024-05-10 11:10:22 +02:00
parent aea7e60858
commit 35eeb078bc
No known key found for this signature in database
GPG key ID: 4A196008E732F17E
8 changed files with 343 additions and 39 deletions

View file

@ -1,3 +0,0 @@
/**
 * Return `str` converted to the Unicode normalization form named by `mode`.
 * Thin wrapper around `String.prototype.normalize`.
 */
export function normalize(str, mode) {
  const normalized = str.normalize(mode);
  return normalized;
}

View file

@ -1,33 +1,15 @@
import gleam/string
import gleam/string_builder.{type StringBuilder}
import glentities/internal/string_utils
@target(erlang)
import glentities/internal/html_encoder/erl as escaper
@target(javascript)
import glentities/internal/html_encoder/generic as escaper
/// Encode text to be safe in the HTML body, inside element or attribute content.
///
/// `&`, `<`, `>`, `'`, and `"` are encoded.
///
/// Note! Not suitable for outputting inside `<style>`, `<script>` elements.
///
pub fn encode(text: String) -> String {
// NOTE(review): this page shows the diff without +/- markers. The pipeline
// below (normalise the text, then escape it with `do_encode`) is presumably
// the body that the commit removes - confirm against the raw diff.
text
|> string_utils.normalise()
|> do_encode(string_builder.new())
}
// Walks `text` from the front, appending the HTML entity for `&`, `<`, `>`,
// `"` and `'` to `acc` and copying every other grapheme through unchanged.
// Returns the accumulated string once the input is exhausted.
// NOTE(review): presumably the implementation removed by this commit - confirm.
fn do_encode(text: String, acc: StringBuilder) {
case text {
"" -> string_builder.to_string(acc)
"&" <> rest -> do_encode(rest, string_builder.append(acc, "&amp;"))
"<" <> rest -> do_encode(rest, string_builder.append(acc, "&lt;"))
">" <> rest -> do_encode(rest, string_builder.append(acc, "&gt;"))
"\"" <> rest -> do_encode(rest, string_builder.append(acc, "&quot;"))
"'" <> rest -> do_encode(rest, string_builder.append(acc, "&#39;"))
other -> {
// Not one of the five escaped characters: move one grapheme across as-is.
let maybe_grapheme = string.pop_grapheme(other)
case maybe_grapheme {
Ok(#(grapheme, rest)) ->
do_encode(rest, string_builder.append(acc, grapheme))
Error(Nil) -> string_builder.to_string(acc)
}
}
}
// NOTE(review): this call reads like the new body of `encode` (delegating to
// the target-specific `escaper` module) rather than part of `do_encode`. The
// `escape` implementations it reaches do not call `string_utils.normalise` -
// confirm that dropping normalisation is intended.
escaper.escape(text)
}

View file

@ -0,0 +1,158 @@
@target(erlang)
import gleam/bit_array
@target(erlang)
import gleam/list
/// Escapes `&`, `<`, `>`, `"` and `'` in `text` with their HTML entities.
///
/// This is the implementation used on the Erlang target. It treats the String
/// as a BitArray and slices it, so that the parts needing no escaping are
/// shared with the input; `do_escape` explains the approach in detail.
@target(erlang)
pub fn escape(text: String) -> String {
  // View the string as its UTF-8 bytes so that we can slice it.
  let bits = <<text:utf8>>

  // `do_escape` prepends each piece it produces, so the pieces come back in
  // reverse order and have to be flipped before being glued together.
  do_escape(bits, 0, bits, [])
  |> list.reverse
  |> bit_array.concat
  |> coerce
}
// Reinterprets a `BitArray` as a `String`. The Erlang implementation
// (`glentities_ffi:coerce/1`) returns its argument unchanged, so nothing is
// copied or checked here.
// NOTE(review): presumably callers must only pass valid UTF-8 - confirm.
@target(erlang)
@external(erlang, "glentities_ffi", "coerce")
fn coerce(bit_array: BitArray) -> String
// Escaping could be done by splitting the string into graphemes and rebuilding
// it one grapheme at a time, replacing ">", "<", etc. along the way.
//
// On the Erlang target we can be a lot more performant by working directly on
// the `BitArray` behind a Gleam UTF-8 String: rather than popping graphemes we
// walk the bytes and only ever take slices of the original input, so whatever
// needs no escaping is shared with the original string instead of being built
// again from scratch.
//
// `do_escape` handles the state where no run of ordinary bytes is pending:
// - `bin` is the part of the input that still has to be examined;
// - `skip` is the byte offset in `original` at which `bin` starts;
// - `acc` holds the output pieces produced so far, most recent first.
//
@target(erlang)
fn do_escape(
  bin: BitArray,
  skip: Int,
  original: BitArray,
  acc: List(BitArray),
) -> List(BitArray) {
  case bin {
    // A char that needs escaping: push its escaped form and move `skip` past
    // it, so that it is left out of whichever slice gets taken next.
    <<"<":utf8, tail:bits>> ->
      do_escape(tail, skip + 1, original, [<<"&lt;":utf8>>, ..acc])
    <<">":utf8, tail:bits>> ->
      do_escape(tail, skip + 1, original, [<<"&gt;":utf8>>, ..acc])
    <<"&":utf8, tail:bits>> ->
      do_escape(tail, skip + 1, original, [<<"&amp;":utf8>>, ..acc])
    <<"\"":utf8, tail:bits>> ->
      do_escape(tail, skip + 1, original, [<<"&quot;":utf8>>, ..acc])
    <<"'":utf8, tail:bits>> ->
      do_escape(tail, skip + 1, original, [<<"&#39;":utf8>>, ..acc])

    // Any other byte starts a run of bytes to copy verbatim: switch to the
    // inner loop with a run length of 1 and let it consume as much as it can.
    <<_byte, tail:bits>> -> do_escape_normal(tail, skip, original, acc, 1)

    // Nothing left to examine.
    <<>> -> acc

    // The patterns above only cover bit arrays made of whole bytes; a bit
    // array can also hold a number of bits that is not a multiple of 8, which
    // is why this catch-all is required. It can't happen for a String.
    _ -> panic as "non byte aligned string, all strings should be byte aligned"
  }
}
// Inner loop of `do_escape`, entered once a byte that needs no escaping has
// been seen. It keeps growing `len` until it meets a char that _does_ need
// escaping (or the end of the input) and then takes a single slice of
// `original`, starting at `skip` and `len` bytes long.
//
// For example, escaping "abc<def&ghi" after having reached the 'd':
// ```
// abc<def&ghi
//     ^ `skip` points here
// ```
// `len` grows by one per byte until the '&' shows up:
// ```
// abc<def&ghi
//     ^^^ `len` is 3 when we reach the '&' that needs escaping
// ```
// so the slice that gets pushed is "def".
//
@target(erlang)
fn do_escape_normal(
  bin: BitArray,
  skip: Int,
  original: BitArray,
  acc: List(BitArray),
  len: Int,
) -> List(BitArray) {
  case bin {
    // A char that needs escaping ends the run: push the slice for the run and
    // then the escaped char. Only the escaped chars are new BitArrays, all the
    // rest is a slice of the original String, which is what gives us the
    // sharing. `skip` then jumps over both the run and the escaped char.
    <<"<":utf8, tail:bits>> -> {
      let assert Ok(chunk) = bit_array.slice(original, skip, len)
      do_escape(tail, skip + len + 1, original, [<<"&lt;":utf8>>, chunk, ..acc])
    }
    <<">":utf8, tail:bits>> -> {
      let assert Ok(chunk) = bit_array.slice(original, skip, len)
      do_escape(tail, skip + len + 1, original, [<<"&gt;":utf8>>, chunk, ..acc])
    }
    <<"&":utf8, tail:bits>> -> {
      let assert Ok(chunk) = bit_array.slice(original, skip, len)
      do_escape(tail, skip + len + 1, original, [
        <<"&amp;":utf8>>,
        chunk,
        ..acc
      ])
    }
    <<"\"":utf8, tail:bits>> -> {
      let assert Ok(chunk) = bit_array.slice(original, skip, len)
      do_escape(tail, skip + len + 1, original, [
        <<"&quot;":utf8>>,
        chunk,
        ..acc
      ])
    }
    <<"'":utf8, tail:bits>> -> {
      let assert Ok(chunk) = bit_array.slice(original, skip, len)
      do_escape(tail, skip + len + 1, original, [
        <<"&#39;":utf8>>,
        chunk,
        ..acc
      ])
    }

    // One more byte to copy verbatim: make the pending slice one byte longer.
    <<_byte, tail:bits>> ->
      do_escape_normal(tail, skip, original, acc, len + 1)

    // End of input with `skip` still at 0: the result is just the original.
    <<>> if skip == 0 -> [original]

    // End of input otherwise: push the run that was still pending.
    <<>> -> {
      let assert Ok(chunk) = bit_array.slice(original, skip, len)
      [chunk, ..acc]
    }

    _ -> panic as "non byte aligned string, all strings should be byte aligned"
  }
}

View file

@ -0,0 +1,132 @@
import gleam/list
import gleam/string
/// Target-independent `escape`: replaces `&`, `<`, `>`, `"` and `'` in `text`
/// with their HTML entities.
///
/// It runs on every target, but on the BEAM the Erlang-specific version is
/// _way faster_, so this fallback is only ever used on the JS backend.
///
pub fn escape(text: String) -> String {
  // The pieces are accumulated most recent first.
  let reversed_pieces = do_escape(text, 0, text, [], 0, False)

  reversed_pieces
  |> list.reverse
  |> string.join(with: "")
}
// Same idea as the Erlang implementation: walk the string one unit at a time
// (whatever `first`/`drop_first` step over on the current target) and only
// ever take slices of `original`, so that as much as possible is shared.
//
// The Erlang version is written as two mutually recursive functions; here the
// two are folded into a single function, with `found_normal` telling which of
// the two states we are in, because mutual recursion would not be tail call
// optimised on the JS target.
//
// - `found_normal == False`: no run of ordinary chars is pending and `skip`
//   is the offset in `original` at which `string` starts.
// - `found_normal == True`: a run of `len` ordinary chars starting at `skip`
//   is pending and has not been pushed onto `acc` yet.
fn do_escape(
  string: String,
  skip: Int,
  original: String,
  acc: List(String),
  len: Int,
  found_normal: Bool,
) -> List(String) {
  let head = first(string)

  case found_normal, head {
    // No pending run and a char to escape: push the escaped form and move
    // `skip` past the char.
    False, "<" ->
      do_escape(drop_first(string), skip + 1, original, ["&lt;", ..acc], 0, False)
    False, ">" ->
      do_escape(drop_first(string), skip + 1, original, ["&gt;", ..acc], 0, False)
    False, "&" ->
      do_escape(drop_first(string), skip + 1, original, ["&amp;", ..acc], 0, False)
    False, "\"" ->
      do_escape(drop_first(string), skip + 1, original, ["&quot;", ..acc], 0, False)
    False, "'" ->
      do_escape(drop_first(string), skip + 1, original, ["&#39;", ..acc], 0, False)

    // End of input with nothing pending.
    False, "" -> acc

    // Any other char opens a run of chars to copy verbatim.
    False, _ -> do_escape(drop_first(string), skip, original, acc, 1, True)

    // Pending run and a char to escape: push the slice for the run, then the
    // escaped form, and move `skip` past both.
    True, "<" -> {
      let pieces = ["&lt;", slice(original, skip, len), ..acc]
      do_escape(drop_first(string), skip + len + 1, original, pieces, 0, False)
    }
    True, ">" -> {
      let pieces = ["&gt;", slice(original, skip, len), ..acc]
      do_escape(drop_first(string), skip + len + 1, original, pieces, 0, False)
    }
    True, "&" -> {
      let pieces = ["&amp;", slice(original, skip, len), ..acc]
      do_escape(drop_first(string), skip + len + 1, original, pieces, 0, False)
    }
    True, "\"" -> {
      let pieces = ["&quot;", slice(original, skip, len), ..acc]
      do_escape(drop_first(string), skip + len + 1, original, pieces, 0, False)
    }
    True, "'" -> {
      let pieces = ["&#39;", slice(original, skip, len), ..acc]
      do_escape(drop_first(string), skip + len + 1, original, pieces, 0, False)
    }

    // End of input with a pending run. If `skip` is still 0 the result is just
    // the original string, otherwise the pending run is pushed.
    True, "" ->
      case skip {
        0 -> [original]
        _ -> [slice(original, skip, len), ..acc]
      }

    // One more char to copy verbatim: make the pending run one longer.
    True, _ -> do_escape(drop_first(string), skip, original, acc, len + 1, True)
  }
}
// Returns the first unit of the string, or "" when the string is empty. The
// unit is a single byte in the Erlang FFI and whatever `string.slice(0, 1)`
// yields in the JavaScript FFI.
@external(erlang, "glentities_ffi", "first")
@external(javascript, "../../../glentities_ffi.mjs", "first")
fn first(_string: String) -> String
// Returns the string without its first unit (the same unit `first` returns),
// or "" when the string is empty.
@external(erlang, "glentities_ffi", "drop_first")
@external(javascript, "../../../glentities_ffi.mjs", "drop_first")
fn drop_first(_string: String) -> String
// Returns the part of the string that starts at offset `_from` and is `_len`
// units long. The third argument is a length, not an end offset: the Erlang
// FFI forwards it to `binary:part/3` and the JavaScript FFI slices from
// `from` to `from + len`.
@external(erlang, "glentities_ffi", "slice")
@external(javascript, "../../../glentities_ffi.mjs", "slice")
fn slice(_string: String, _from: Int, _len: Int) -> String

View file

@ -8,5 +8,5 @@ pub fn normalise(text: String) -> String {
}
/// Normalises `text` to the Unicode normalisation form named by `mode`, by
/// calling the JavaScript FFI function `normalize`.
// NOTE(review): this page shows the diff without +/- markers, so the two
// JavaScript `@external` lines below are presumably the old path
// ("../../ffi_js.mjs") followed by the new one ("../../glentities_ffi.mjs") -
// confirm that only the latter remains in the file.
@target(javascript)
@external(javascript, "../../ffi_js.mjs", "normalize")
@external(javascript, "../../glentities_ffi.mjs", "normalize")
pub fn normalise_js(text text: String, mode mode: String) -> String

20
src/glentities_ffi.erl Normal file
View file

@ -0,0 +1,20 @@
-module(glentities_ffi).
-export([coerce/1, slice/3, first/1, drop_first/1]).
%% Identity function: hands its argument back untouched. Gleam declares it as
%% `BitArray -> String`, so it only changes the type seen by the Gleam compiler.
coerce(Value) -> Value.
%% Returns the `Len` bytes of `String` that start at byte offset `From`.
slice(String, From, Len) ->
    binary:part(String, {From, Len}).
%% Returns the first byte of the given binary as a one-byte binary, or the
%% empty binary when there is nothing to take.
first(<<>>) -> <<>>;
first(<<Byte, _/bitstring>>) -> <<Byte>>.
%% Returns the given binary without its first byte, or the empty binary when
%% it is already empty.
drop_first(<<>>) -> <<>>;
drop_first(<<_, Tail/bitstring>>) -> Tail.

15
src/glentities_ffi.mjs Normal file
View file

@ -0,0 +1,15 @@
/**
 * Return `str` converted to the Unicode normalization form named by `mode`.
 * Thin wrapper around `String.prototype.normalize`.
 */
export function normalize(str, mode) {
  const normalized = str.normalize(mode);
  return normalized;
}
/**
 * Return the `len` UTF-16 code units of `string` that start at index `from`.
 */
export function slice(string, from, len) {
  const end = from + len;
  return string.slice(from, end);
}
/**
 * Return the first UTF-16 code unit of `string` as a string, or "" when
 * `string` is empty.
 */
export function first(string) {
  return string.charAt(0);
}
/**
 * Return `string` without its first UTF-16 code unit; "" stays "".
 */
export function drop_first(string) {
  const remainder = string.slice(1, string.length);
  return remainder;
}