Merge branch 'performance-improvements' into 'trunk'
Optimised implementation of html_encoder. See merge request Nicd/glentities!1
This commit is contained in:
commit
8431bfedd7
8 changed files with 343 additions and 39 deletions
|
@ -1,3 +0,0 @@
|
|||
// Applies Unicode normalisation to `str` using the given form name
// (whatever `String.prototype.normalize` accepts for `mode`).
export function normalize(str, mode) {
  const normalised = str.normalize(mode);
  return normalised;
}
|
|
@ -1,33 +1,15 @@
|
|||
import gleam/string
|
||||
import gleam/string_builder.{type StringBuilder}
|
||||
import glentities/internal/string_utils
|
||||
@target(erlang)
|
||||
import glentities/internal/html_encoder/erl as escaper
|
||||
|
||||
@target(javascript)
|
||||
import glentities/internal/html_encoder/generic as escaper
|
||||
|
||||
/// Encode text to be safe in the HTML body, inside element or attribute content.
|
||||
///
|
||||
/// `&`, `<`, `>`, `'`, and `"` are encoded.
|
||||
///
|
||||
/// Note! Not suitable for outputting inside `<style>`, `<script>` elements.
|
||||
///
|
||||
pub fn encode(text: String) -> String {
  // Post-merge behaviour: all the work is delegated to the module imported as
  // `escaper` (the BitArray based one on Erlang, the generic one on
  // JavaScript). The earlier `string_utils.normalise()` step is no longer
  // part of the pipeline.
  escaper.escape(text)
}

// Previous implementation, superseded by `escaper.escape` in this merge and no
// longer called by `encode`. It walks the text one grapheme at a time and
// appends either the grapheme itself or its HTML character reference to a
// StringBuilder. The replacement strings are the character references
// (`&amp;`, `&lt;`, ...), not the raw characters.
fn do_encode(text: String, acc: StringBuilder) -> String {
  case text {
    "" -> string_builder.to_string(acc)
    "&" <> rest -> do_encode(rest, string_builder.append(acc, "&amp;"))
    "<" <> rest -> do_encode(rest, string_builder.append(acc, "&lt;"))
    ">" <> rest -> do_encode(rest, string_builder.append(acc, "&gt;"))
    "\"" <> rest -> do_encode(rest, string_builder.append(acc, "&quot;"))
    // Numeric reference for the apostrophe.
    "'" <> rest -> do_encode(rest, string_builder.append(acc, "&#39;"))
    other ->
      case string.pop_grapheme(other) {
        Ok(#(grapheme, rest)) ->
          do_encode(rest, string_builder.append(acc, grapheme))
        // Nothing left to pop: the accumulated text is the result.
        Error(Nil) -> string_builder.to_string(acc)
      }
  }
}
|
||||
|
|
158
src/glentities/internal/html_encoder/erl.gleam
Normal file
158
src/glentities/internal/html_encoder/erl.gleam
Normal file
|
@ -0,0 +1,158 @@
|
|||
@target(erlang)
|
||||
import gleam/bit_array
|
||||
@target(erlang)
|
||||
import gleam/list
|
||||
|
||||
@target(erlang)
pub fn escape(text: String) -> String {
  // Erlang-only fast path: view the String as a BitArray so that the escaping
  // loop can hand back slices of it. See `do_escape` for the details.
  let source = <<text:utf8>>

  // The loop prepends every chunk it produces, so the list comes back in
  // reverse order and has to be flipped before it is glued together.
  do_escape(source, 0, source, [])
  |> list.reverse
  |> bit_array.concat
  |> coerce
}
|
||||
|
||||
// Reinterprets a BitArray as a String. The Erlang implementation
// (`glentities_ffi:coerce/1`) returns its argument unchanged, so no validation
// of the bytes happens here.
@target(erlang)
|
||||
@external(erlang, "glentities_ffi", "coerce")
|
||||
fn coerce(bit_array: BitArray) -> String
|
||||
|
||||
// A possible way to escape chars would be to split the string into graphemes,
|
||||
// traverse those one by one and accumulate them back into a string escaping
|
||||
// ">", "<", etc. as we see them.
|
||||
//
|
||||
// (For now this works just on the Erlang side)
|
||||
// However, we can be a lot more performant by working directly on the
|
||||
// `BitArray` representing a Gleam UTF-8 String.
|
||||
// This means that, instead of popping a grapheme at a time, we can work
|
||||
// directly on BitArray slices: this has the big advantage of making sure we
|
||||
// share as much as possible with the original string without having to build
|
||||
// a new one from scratch.
|
||||
//
|
||||
// Outer loop of the Erlang escaper.
//
// - `bin` is the part of the input that still has to be inspected.
// - `skip` is the byte offset into `original` of the first byte of `bin`.
// - `original` is the whole, untouched input; slices are taken from it.
// - `acc` collects the output chunks in reverse order.
//
// Returns the (reversed) list of chunks making up the escaped text.
@target(erlang)
fn do_escape(
  bin: BitArray,
  skip: Int,
  original: BitArray,
  acc: List(BitArray),
) -> List(BitArray) {
  case bin {
    // If we find a char to escape we just advance the `skip` counter so that
    // it will be ignored in the following slice, then we push its HTML
    // character reference onto the accumulator.
    <<"<":utf8, rest:bits>> -> {
      let acc = [<<"&lt;":utf8>>, ..acc]
      do_escape(rest, skip + 1, original, acc)
    }

    <<">":utf8, rest:bits>> -> {
      let acc = [<<"&gt;":utf8>>, ..acc]
      do_escape(rest, skip + 1, original, acc)
    }

    <<"&":utf8, rest:bits>> -> {
      let acc = [<<"&amp;":utf8>>, ..acc]
      do_escape(rest, skip + 1, original, acc)
    }

    <<"\"":utf8, rest:bits>> -> {
      let acc = [<<"&quot;":utf8>>, ..acc]
      do_escape(rest, skip + 1, original, acc)
    }

    <<"'":utf8, rest:bits>> -> {
      let acc = [<<"&#39;":utf8>>, ..acc]
      do_escape(rest, skip + 1, original, acc)
    }

    // For any other byte that doesn't need to be escaped we go into an inner
    // loop, consuming as many "non-escapable" bytes as possible. The run
    // starts at `skip` and is one byte long so far.
    <<_char, rest:bits>> -> do_escape_normal(rest, skip, original, acc, 1)

    <<>> -> acc

    // I think this might be a bug in exhaustiveness checking.
    _ -> panic as "non byte aligned string, all strings should be byte aligned"
  }
}
|
||||
|
||||
// Inner loop of the Erlang escaper, entered once a byte that needs no
// escaping has been seen.
//
// - `bin` is the part of the input that still has to be inspected.
// - `skip` is the byte offset into `original` where the current run of plain
//   bytes starts.
// - `len` is the number of plain bytes seen in the current run so far.
// - `original` and `acc` are the same as in `do_escape`.
//
// Imagine we're escaping this string: "abc<def&ghi" and we've reached 'd':
// ```
//   abc<def&ghi
//       ^ `skip` points here
// ```
// We're going to be increasing `len` until we reach the '&':
// ```
//   abc<def&ghi
//       ^^^ len will be 3 when we reach the '&' that needs escaping
// ```
// So we take the slice corresponding to "def".
@target(erlang)
fn do_escape_normal(
  bin: BitArray,
  skip: Int,
  original: BitArray,
  acc: List(BitArray),
  len: Int,
) -> List(BitArray) {
  case bin {
    // If we reach a char that has to be escaped we push the slice starting
    // from `skip` with size `len`, followed by the character reference.
    // This is what allows us to share as much of the original string as
    // possible: we only allocate a new BitArray for the escaped chars,
    // everything else is just a slice of the original String.
    <<"<":utf8, rest:bits>> -> {
      let assert Ok(slice) = bit_array.slice(original, skip, len)
      let acc = [<<"&lt;":utf8>>, slice, ..acc]
      // The next run starts after the slice and the escaped byte.
      do_escape(rest, skip + len + 1, original, acc)
    }

    <<">":utf8, rest:bits>> -> {
      let assert Ok(slice) = bit_array.slice(original, skip, len)
      let acc = [<<"&gt;":utf8>>, slice, ..acc]
      do_escape(rest, skip + len + 1, original, acc)
    }

    <<"&":utf8, rest:bits>> -> {
      let assert Ok(slice) = bit_array.slice(original, skip, len)
      let acc = [<<"&amp;":utf8>>, slice, ..acc]
      do_escape(rest, skip + len + 1, original, acc)
    }

    <<"\"":utf8, rest:bits>> -> {
      let assert Ok(slice) = bit_array.slice(original, skip, len)
      let acc = [<<"&quot;":utf8>>, slice, ..acc]
      do_escape(rest, skip + len + 1, original, acc)
    }

    <<"'":utf8, rest:bits>> -> {
      let assert Ok(slice) = bit_array.slice(original, skip, len)
      let acc = [<<"&#39;":utf8>>, slice, ..acc]
      do_escape(rest, skip + len + 1, original, acc)
    }

    // If a char doesn't need escaping we keep increasing the length of the
    // slice we're going to take.
    <<_char, rest:bits>> -> do_escape_normal(rest, skip, original, acc, len + 1)

    <<>> ->
      case skip {
        // `skip` only grows when something gets escaped, so 0 here means the
        // whole input was plain and can be returned as is.
        0 -> [original]
        _ -> {
          let assert Ok(slice) = bit_array.slice(original, skip, len)
          [slice, ..acc]
        }
      }

    _ -> panic as "non byte aligned string, all strings should be byte aligned"
  }
}
|
132
src/glentities/internal/html_encoder/generic.gleam
Normal file
132
src/glentities/internal/html_encoder/generic.gleam
Normal file
|
@ -0,0 +1,132 @@
|
|||
import gleam/list
|
||||
import gleam/string
|
||||
|
||||
/// This `escape` function will work on all targets, beware that the version
|
||||
/// specifically optimised for Erlang will be _way faster_ than this one when
|
||||
/// running on the BEAM. That's why this fallback implementation is only ever
|
||||
/// used when running on the JS backend.
|
||||
///
|
||||
pub fn escape(text: String) -> String {
  // `do_escape` hands its chunks back in reverse order.
  let reversed_chunks = do_escape(text, 0, text, [], 0, False)

  reversed_chunks
  |> list.reverse
  |> string.join(with: "")
}
|
||||
|
||||
// The logic behind this function is exactly the same as the erlang one: we
|
||||
// iterate the string byte by byte and only ever take slices of it (constant
|
||||
// time operation that ensures maximum sharing). However, this implementation is
|
||||
// a little more convoluted since we cannot define it as two mutually recursive
|
||||
// functions as we did with the Erlang one (or it won't be tail call optimised
|
||||
// on the JS target).
|
||||
// Single-function version of the two Erlang loops.
//
// - `string` is the part of the input that still has to be inspected.
// - `skip` is the offset into `original` where the current run of plain
//   characters starts (or would start).
// - `original` is the whole, untouched input; slices are taken from it.
// - `acc` collects the output chunks in reverse order.
// - `len` is the length of the current run of plain characters.
// - `found_normal` tells whether we are inside such a run (`True`, the
//   equivalent of Erlang's `do_escape_normal`) or not (`False`).
//
// Offsets and lengths are counted in whatever unit `first`/`drop_first`
// consume at a time.
fn do_escape(
  string: String,
  skip: Int,
  original: String,
  acc: List(String),
  len: Int,
  found_normal: Bool,
) -> List(String) {
  case found_normal, first(string) {
    // Not inside a run: a char to escape only contributes its character
    // reference and moves `skip` past itself.
    False, "<" -> {
      let rest = drop_first(string)
      let acc = ["&lt;", ..acc]
      do_escape(rest, skip + 1, original, acc, 0, False)
    }

    False, ">" -> {
      let rest = drop_first(string)
      let acc = ["&gt;", ..acc]
      do_escape(rest, skip + 1, original, acc, 0, False)
    }

    False, "&" -> {
      let rest = drop_first(string)
      let acc = ["&amp;", ..acc]
      do_escape(rest, skip + 1, original, acc, 0, False)
    }

    False, "\"" -> {
      let rest = drop_first(string)
      let acc = ["&quot;", ..acc]
      do_escape(rest, skip + 1, original, acc, 0, False)
    }

    False, "'" -> {
      let rest = drop_first(string)
      let acc = ["&#39;", ..acc]
      do_escape(rest, skip + 1, original, acc, 0, False)
    }

    False, "" -> acc

    // Anything else that doesn't need to be escaped starts a run: we switch
    // to the "inner loop" mode, consuming as many plain chars as possible.
    False, _ -> {
      let rest = drop_first(string)
      do_escape(rest, skip, original, acc, 1, True)
    }

    // Inside a run: a char to escape ends it, so we push the slice of the
    // original covering the run followed by the character reference.
    True, "<" -> {
      let rest = drop_first(string)
      let slice = slice(original, skip, len)
      let acc = ["&lt;", slice, ..acc]
      do_escape(rest, skip + len + 1, original, acc, 0, False)
    }

    True, ">" -> {
      let rest = drop_first(string)
      let slice = slice(original, skip, len)
      let acc = ["&gt;", slice, ..acc]
      do_escape(rest, skip + len + 1, original, acc, 0, False)
    }

    True, "&" -> {
      let rest = drop_first(string)
      let slice = slice(original, skip, len)
      let acc = ["&amp;", slice, ..acc]
      do_escape(rest, skip + len + 1, original, acc, 0, False)
    }

    True, "\"" -> {
      let rest = drop_first(string)
      let slice = slice(original, skip, len)
      let acc = ["&quot;", slice, ..acc]
      do_escape(rest, skip + len + 1, original, acc, 0, False)
    }

    True, "'" -> {
      let rest = drop_first(string)
      let slice = slice(original, skip, len)
      let acc = ["&#39;", slice, ..acc]
      do_escape(rest, skip + len + 1, original, acc, 0, False)
    }

    True, "" ->
      case skip {
        // `skip` only grows when something gets escaped, so 0 here means the
        // whole input was plain and can be returned as is.
        0 -> [original]
        _ -> {
          let slice = slice(original, skip, len)
          [slice, ..acc]
        }
      }

    // If a char doesn't need escaping we keep increasing the length of the
    // slice we're going to take.
    True, _ -> {
      let rest = drop_first(string)
      do_escape(rest, skip, original, acc, len + 1, True)
    }
  }
}
|
||||
|
||||
// Returns the leading unit of the string, or "" when the string is empty.
// The Erlang implementation takes one byte; the JavaScript one takes
// `string.slice(0, 1)`.
@external(erlang, "glentities_ffi", "first")
|
||||
@external(javascript, "../../../glentities_ffi.mjs", "first")
|
||||
fn first(_string: String) -> String
|
||||
|
||||
// Returns the string without its leading unit (the counterpart of `first`);
// an empty string stays empty. The Erlang implementation drops one byte, the
// JavaScript one returns `string.slice(1)`.
@external(erlang, "glentities_ffi", "drop_first")
|
||||
@external(javascript, "../../../glentities_ffi.mjs", "drop_first")
|
||||
fn drop_first(_string: String) -> String
|
||||
|
||||
// Takes `_len` units of `_string` starting at offset `_from`.
//
// The third argument is a length, not an end index: the Erlang side calls
// `binary:part(String, From, Len)` and the JavaScript side calls
// `string.slice(from, from + len)`.
@external(erlang, "glentities_ffi", "slice")
@external(javascript, "../../../glentities_ffi.mjs", "slice")
fn slice(_string: String, _from: Int, _len: Int) -> String
|
|
@ -8,5 +8,5 @@ pub fn normalise(text: String) -> String {
|
|||
}
|
||||
|
||||
// JavaScript-only binding to `normalize` in `glentities_ffi.mjs`, which calls
// `text.normalize(mode)`. The binding used to point at `ffi_js.mjs`; that file
// is deleted by this merge, so only the new module path is kept.
@target(javascript)
@external(javascript, "../../glentities_ffi.mjs", "normalize")
pub fn normalise_js(text text: String, mode mode: String) -> String
|
||||
|
|
20
src/glentities_ffi.erl
Normal file
20
src/glentities_ffi.erl
Normal file
|
@ -0,0 +1,20 @@
|
|||
-module(glentities_ffi).
|
||||
-export([coerce/1, slice/3, first/1, drop_first/1]).
|
||||
|
||||
%% Identity function: hands back whatever term it is given.
coerce(Term) -> Term.
|
||||
|
||||
%% Returns the Len bytes of Bin that start at byte offset Start.
slice(Bin, Start, Len) ->
    binary:part(Bin, {Start, Len}).
|
||||
|
||||
%% Returns a one-byte binary holding the first byte of Bin, or the empty
%% binary when Bin is empty.
first(Bin) ->
    case Bin of
        <<Head, _Tail/bitstring>> ->
            <<Head>>;
        <<>> ->
            <<>>
    end.
|
||||
|
||||
%% Returns Bin without its first byte; the empty binary is returned as is.
drop_first(Bin) ->
    case Bin of
        <<_Head, Tail/bitstring>> ->
            Tail;
        <<>> ->
            <<>>
    end.
|
15
src/glentities_ffi.mjs
Normal file
15
src/glentities_ffi.mjs
Normal file
|
@ -0,0 +1,15 @@
|
|||
// Applies Unicode normalisation to `str` using the given form name
// (whatever `String.prototype.normalize` accepts for `mode`).
export function normalize(str, mode) {
  const normalised = str.normalize(mode);
  return normalised;
}
|
||||
|
||||
// Returns `len` UTF-16 code units of `string`, starting at index `from`.
export function slice(string, from, len) {
  const end = from + len;
  return string.slice(from, end);
}
|
||||
|
||||
// Returns the first UTF-16 code unit of `string`, or "" when it is empty.
export function first(string) {
  return string.charAt(0);
}
|
||||
|
||||
// Returns `string` without its first UTF-16 code unit; "" stays "".
export function drop_first(string) {
  return string.substring(1);
}
|
Loading…
Reference in a new issue