Merge branch 'performance-improvements' into 'trunk'
optimised implementation of html_encoder See merge request Nicd/glentities!1
This commit is contained in:
commit
8431bfedd7
8 changed files with 343 additions and 39 deletions
|
@ -1,3 +0,0 @@
|
||||||
// FFI shim: forwards to `String.prototype.normalize`, with `mode` naming the
// Unicode normalisation form to apply to `str`.
export function normalize(str, mode) {
  const normalised = str.normalize(mode);
  return normalised;
}
|
|
|
@ -1,33 +1,15 @@
|
||||||
import gleam/string
|
@target(erlang)
|
||||||
import gleam/string_builder.{type StringBuilder}
|
import glentities/internal/html_encoder/erl as escaper
|
||||||
import glentities/internal/string_utils
|
|
||||||
|
@target(javascript)
|
||||||
|
import glentities/internal/html_encoder/generic as escaper
|
||||||
|
|
||||||
/// Encode text to be safe in the HTML body, inside element or attribute content.
|
/// Encode text to be safe in the HTML body, inside element or attribute content.
|
||||||
///
|
///
|
||||||
/// `&`, `<`, `>`, `'`, and `"` are encoded.
|
/// `&`, `<`, `>`, `'`, and `"` are encoded.
|
||||||
///
|
///
|
||||||
/// Note! Not suitable for outputting inside `<style>`, `<script>` elements.
|
/// Note! Not suitable for outputting inside `<style>`, `<script>` elements.
|
||||||
|
///
|
||||||
pub fn encode(text: String) -> String {
|
pub fn encode(text: String) -> String {
|
||||||
text
|
escaper.escape(text)
|
||||||
|> string_utils.normalise()
|
|
||||||
|> do_encode(string_builder.new())
|
|
||||||
}
|
|
||||||
|
|
||||||
// Walks `text` from the front. When the leading character is one of `&`, `<`,
// `>`, `"` or `'`, its HTML entity is appended to `acc`; otherwise the leading
// grapheme is appended unchanged. Returns the accumulated string once `text`
// is exhausted.
fn do_encode(text: String, acc: StringBuilder) {
  case text {
    "" -> string_builder.to_string(acc)
    "&" <> rest -> do_encode(rest, string_builder.append(acc, "&amp;"))
    "<" <> rest -> do_encode(rest, string_builder.append(acc, "&lt;"))
    ">" <> rest -> do_encode(rest, string_builder.append(acc, "&gt;"))
    "\"" <> rest -> do_encode(rest, string_builder.append(acc, "&quot;"))
    "'" <> rest -> do_encode(rest, string_builder.append(acc, "&#39;"))
    other ->
      case string.pop_grapheme(other) {
        Ok(#(grapheme, rest)) ->
          do_encode(rest, string_builder.append(acc, grapheme))
        // `other` is never empty here (the `""` branch above catches that),
        // but the Result still has to be handled.
        Error(Nil) -> string_builder.to_string(acc)
      }
  }
}
|
}
|
||||||
|
|
158
src/glentities/internal/html_encoder/erl.gleam
Normal file
158
src/glentities/internal/html_encoder/erl.gleam
Normal file
|
@ -0,0 +1,158 @@
|
||||||
|
@target(erlang)
|
||||||
|
import gleam/bit_array
|
||||||
|
@target(erlang)
|
||||||
|
import gleam/list
|
||||||
|
|
||||||
|
@target(erlang)
pub fn escape(text: String) -> String {
  // Erlang-only entry point. The string is viewed as a BitArray so that
  // `do_escape` can hand back slices of it instead of rebuilding the text.
  // The pieces come back in reverse order, so they are reversed, concatenated
  // and finally reinterpreted as a String.
  let original = <<text:utf8>>

  original
  |> do_escape(0, original, [])
  |> list.reverse
  |> bit_array.concat
  |> coerce
}
|
||||||
|
|
||||||
|
// Reinterprets a BitArray as a String. The Erlang FFI function behind it
// returns its argument unchanged, so no validation happens here.
@target(erlang)
@external(erlang, "glentities_ffi", "coerce")
fn coerce(bits: BitArray) -> String
|
||||||
|
|
||||||
|
// A possible way to escape chars would be to split the string into graphemes,
|
||||||
|
// traverse those one by one and accumulate them back into a string escaping
|
||||||
|
// ">", "<", etc. as we see them.
|
||||||
|
//
|
||||||
|
// (For now this works just on the Erlang side)
|
||||||
|
// However, we can be a lot more performant by working directly on the
|
||||||
|
// `BitArray` representing a Gleam UTF-8 String.
|
||||||
|
// This means that, instead of popping a grapheme at a time, we can work
|
||||||
|
// directly on BitArray slices: this has the big advantage of making sure we
|
||||||
|
// share as much as possible with the original string without having to build
|
||||||
|
// a new one from scratch.
|
||||||
|
//
|
||||||
|
// Outer loop of the Erlang escaper.
//
// - `bin` is the part of the input still to be examined.
// - `skip` is the byte offset into `original` where `bin` starts.
// - `original` is the whole input; it is only ever sliced, never copied.
// - `acc` holds the output pieces collected so far, in reverse order.
//
// Returns the accumulator; the caller reverses and concatenates it.
@target(erlang)
fn do_escape(
  bin: BitArray,
  skip: Int,
  original: BitArray,
  acc: List(BitArray),
) -> List(BitArray) {
  case bin {
    // If we find a char to escape we just advance the `skip` counter so that
    // it will be ignored in the following slice, then we append the escaped
    // version to the accumulator.
    <<"<":utf8, rest:bits>> -> {
      let acc = [<<"&lt;":utf8>>, ..acc]
      do_escape(rest, skip + 1, original, acc)
    }

    <<">":utf8, rest:bits>> -> {
      let acc = [<<"&gt;":utf8>>, ..acc]
      do_escape(rest, skip + 1, original, acc)
    }

    <<"&":utf8, rest:bits>> -> {
      let acc = [<<"&amp;":utf8>>, ..acc]
      do_escape(rest, skip + 1, original, acc)
    }

    <<"\"":utf8, rest:bits>> -> {
      let acc = [<<"&quot;":utf8>>, ..acc]
      do_escape(rest, skip + 1, original, acc)
    }

    <<"'":utf8, rest:bits>> -> {
      let acc = [<<"&#39;":utf8>>, ..acc]
      do_escape(rest, skip + 1, original, acc)
    }

    // For any other byte that doesn't need to be escaped we go into an inner
    // loop, consuming as many "non-escapable" bytes as possible. The byte just
    // matched is the first one of that run, hence the initial length of 1.
    <<_char, rest:bits>> -> do_escape_normal(rest, skip, original, acc, 1)

    <<>> -> acc

    // I think this might be a bug in exhaustiveness checking.
    _ -> panic as "non byte aligned string, all strings should be byte aligned"
  }
}
|
||||||
|
|
||||||
|
// Inner loop of the Erlang escaper, entered once a byte that needs no escaping
// has been seen.
//
// - `bin` is the part of the input still to be examined.
// - `skip` is the byte offset into `original` where the current run of
//   unescaped bytes starts.
// - `len` is the number of bytes in that run so far.
// - `original` is the whole input and `acc` the reversed output pieces.
@target(erlang)
fn do_escape_normal(
  bin: BitArray,
  skip: Int,
  original: BitArray,
  acc: List(BitArray),
  len: Int,
) -> List(BitArray) {
  // Remember, if we're here it means we've found a char that doesn't need to be
  // escaped, so what we want to do is advance the `len` counter until we reach
  // a char that _does_ need to be escaped and take the slice going from
  // `skip` with size `len`.
  //
  // Imagine we're escaping this string: "abc<def&ghi" and we've reached 'd':
  // ```
  //    abc<def&ghi
  //        ^ `skip` points here
  // ```
  // We're going to be increasing `len` until we reach the '&':
  // ```
  //    abc<def&ghi
  //        ^^^ len will be 3 when we reach the '&' that needs escaping
  // ```
  // So we take the slice corresponding to "def".
  //
  case bin {
    // If we reach a char that has to be escaped we append the slice starting
    // from `skip` with size `len` and the escaped char.
    // This is what allows us to share as much of the original string as
    // possible: we only allocate a new BitArray for the escaped chars,
    // everything else is just a slice of the original String.
    <<"<":utf8, rest:bits>> -> {
      let assert Ok(slice) = bit_array.slice(original, skip, len)
      let acc = [<<"&lt;":utf8>>, slice, ..acc]
      do_escape(rest, skip + len + 1, original, acc)
    }

    <<">":utf8, rest:bits>> -> {
      let assert Ok(slice) = bit_array.slice(original, skip, len)
      let acc = [<<"&gt;":utf8>>, slice, ..acc]
      do_escape(rest, skip + len + 1, original, acc)
    }

    <<"&":utf8, rest:bits>> -> {
      let assert Ok(slice) = bit_array.slice(original, skip, len)
      let acc = [<<"&amp;":utf8>>, slice, ..acc]
      do_escape(rest, skip + len + 1, original, acc)
    }

    <<"\"":utf8, rest:bits>> -> {
      let assert Ok(slice) = bit_array.slice(original, skip, len)
      let acc = [<<"&quot;":utf8>>, slice, ..acc]
      do_escape(rest, skip + len + 1, original, acc)
    }

    <<"'":utf8, rest:bits>> -> {
      let assert Ok(slice) = bit_array.slice(original, skip, len)
      let acc = [<<"&#39;":utf8>>, slice, ..acc]
      do_escape(rest, skip + len + 1, original, acc)
    }

    // If a char doesn't need escaping we keep increasing the length of the
    // slice we're going to take.
    <<_char, rest:bits>> -> do_escape_normal(rest, skip, original, acc, len + 1)

    <<>> ->
      case skip {
        // `skip` only moves forward when something was escaped, so 0 here
        // means the input contained nothing to escape: return it untouched.
        0 -> [original]
        _ -> {
          let assert Ok(slice) = bit_array.slice(original, skip, len)
          [slice, ..acc]
        }
      }

    _ -> panic as "non byte aligned string, all strings should be byte aligned"
  }
}
|
132
src/glentities/internal/html_encoder/generic.gleam
Normal file
132
src/glentities/internal/html_encoder/generic.gleam
Normal file
|
@ -0,0 +1,132 @@
|
||||||
|
import gleam/list
|
||||||
|
import gleam/string
|
||||||
|
|
||||||
|
/// Target-independent `escape`. On the BEAM the Erlang-specific version is
/// _much_ faster, which is why this fallback is only ever selected when
/// compiling for the JS backend.
///
pub fn escape(text: String) -> String {
  // `do_escape` hands back the output pieces in reverse order.
  let pieces = do_escape(text, 0, text, [], 0, False)
  string.join(list.reverse(pieces), with: "")
}
|
||||||
|
|
||||||
|
// The logic behind this function is exactly the same as the erlang one: we
|
||||||
|
// iterate the string byte by byte and only ever take slices of it (constant
|
||||||
|
// time operation that ensures maximum sharing). However, this implementation is
|
||||||
|
// a little more convoluted since we cannot define it as two mutually recursive
|
||||||
|
// functions as we did with the Erlang one (or it won't be tail call optimised
|
||||||
|
// on the JS target).
|
||||||
|
// Single-function escaper: one self-recursive loop with a `found_normal` flag
// that records whether a run of characters needing no escaping is in progress.
//
// - `string` is the part of the input still to be examined.
// - `skip` is the offset into `original` of the current position, or of the
//   start of the current run while `found_normal` is True.
// - `len` is the length of the current run (0 while `found_normal` is False).
// - `original` is the whole input; `acc` holds the reversed output pieces.
//
// `skip` and `len` advance by one for every unit `first`/`drop_first` consume,
// which is the same unit `slice` is called with.
fn do_escape(
  string: String,
  skip: Int,
  original: String,
  acc: List(String),
  len: Int,
  found_normal: Bool,
) -> List(String) {
  case found_normal, first(string) {
    False, "<" -> {
      let rest = drop_first(string)
      let acc = ["&lt;", ..acc]
      do_escape(rest, skip + 1, original, acc, 0, False)
    }

    False, ">" -> {
      let rest = drop_first(string)
      let acc = ["&gt;", ..acc]
      do_escape(rest, skip + 1, original, acc, 0, False)
    }

    False, "&" -> {
      let rest = drop_first(string)
      let acc = ["&amp;", ..acc]
      do_escape(rest, skip + 1, original, acc, 0, False)
    }

    False, "\"" -> {
      let rest = drop_first(string)
      let acc = ["&quot;", ..acc]
      do_escape(rest, skip + 1, original, acc, 0, False)
    }

    False, "'" -> {
      let rest = drop_first(string)
      let acc = ["&#39;", ..acc]
      do_escape(rest, skip + 1, original, acc, 0, False)
    }

    False, "" -> acc

    // For any other char that doesn't need to be escaped we switch to the
    // "run" state, consuming as many "non-escapable" chars as possible.
    False, _ -> {
      let rest = drop_first(string)
      do_escape(rest, skip, original, acc, 1, True)
    }

    // A char to escape ends the current run: emit the run as a slice of the
    // original followed by the escaped char, then leave the "run" state.
    True, "<" -> {
      let rest = drop_first(string)
      let slice = slice(original, skip, len)
      let acc = ["&lt;", slice, ..acc]
      do_escape(rest, skip + len + 1, original, acc, 0, False)
    }

    True, ">" -> {
      let rest = drop_first(string)
      let slice = slice(original, skip, len)
      let acc = ["&gt;", slice, ..acc]
      do_escape(rest, skip + len + 1, original, acc, 0, False)
    }

    True, "&" -> {
      let rest = drop_first(string)
      let slice = slice(original, skip, len)
      let acc = ["&amp;", slice, ..acc]
      do_escape(rest, skip + len + 1, original, acc, 0, False)
    }

    True, "\"" -> {
      let rest = drop_first(string)
      let slice = slice(original, skip, len)
      let acc = ["&quot;", slice, ..acc]
      do_escape(rest, skip + len + 1, original, acc, 0, False)
    }

    True, "'" -> {
      let rest = drop_first(string)
      let slice = slice(original, skip, len)
      let acc = ["&#39;", slice, ..acc]
      do_escape(rest, skip + len + 1, original, acc, 0, False)
    }

    True, "" ->
      case skip {
        // `skip` only moves forward when something was escaped, so 0 here
        // means the input contained nothing to escape: return it untouched.
        0 -> [original]
        _ -> {
          let slice = slice(original, skip, len)
          [slice, ..acc]
        }
      }

    // If a char doesn't need escaping we keep increasing the length of the
    // slice we're going to take.
    True, _ -> {
      let rest = drop_first(string)
      do_escape(rest, skip, original, acc, len + 1, True)
    }
  }
}
|
||||||
|
|
||||||
|
// Returns the leading unit of `_text`, or "" when `_text` is empty. The unit
// is one byte in the Erlang implementation and one UTF-16 code unit in the
// JavaScript one.
@external(erlang, "glentities_ffi", "first")
@external(javascript, "../../../glentities_ffi.mjs", "first")
fn first(_text: String) -> String
|
||||||
|
|
||||||
|
// Returns `_text` without its leading unit (the same unit `first` returns),
// or "" when `_text` is empty.
@external(erlang, "glentities_ffi", "drop_first")
@external(javascript, "../../../glentities_ffi.mjs", "drop_first")
fn drop_first(_text: String) -> String
|
||||||
|
|
||||||
|
// Returns the `_len` units of `_string` starting at offset `_from`.
//
// The third argument is a length, not an end index: the Erlang implementation
// passes it to `binary:part/3` as the length and the JavaScript one slices up
// to `from + len`.
@external(erlang, "glentities_ffi", "slice")
@external(javascript, "../../../glentities_ffi.mjs", "slice")
fn slice(_string: String, _from: Int, _len: Int) -> String
|
|
@ -8,5 +8,5 @@ pub fn normalise(text: String) -> String {
|
||||||
}
|
}
|
||||||
|
|
||||||
@target(javascript)
|
@target(javascript)
|
||||||
@external(javascript, "../../ffi_js.mjs", "normalize")
|
@external(javascript, "../../glentities_ffi.mjs", "normalize")
|
||||||
pub fn normalise_js(text text: String, mode mode: String) -> String
|
pub fn normalise_js(text text: String, mode mode: String) -> String
|
||||||
|
|
20
src/glentities_ffi.erl
Normal file
20
src/glentities_ffi.erl
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
-module(glentities_ffi).
|
||||||
|
-export([coerce/1, slice/3, first/1, drop_first/1]).
|
||||||
|
|
||||||
|
%% Identity function: lets the Gleam side treat a value as another type
%% without touching it at runtime.
coerce(Value) -> Value.
|
||||||
|
|
||||||
|
%% Returns the `Len` bytes of `String` that start at byte offset `From`.
slice(String, From, Len) ->
    binary:part(String, {From, Len}).
|
||||||
|
|
||||||
|
%% Returns a one-byte binary holding the first byte of `String`, or the empty
%% binary when `String` is empty.
first(String) ->
    case String of
        <<Head, _/bitstring>> -> <<Head>>;
        <<>> -> <<>>
    end.
|
||||||
|
|
||||||
|
%% Returns `String` without its first byte, or the empty binary when `String`
%% is empty.
drop_first(String) ->
    case String of
        <<_, Tail/bitstring>> -> Tail;
        <<>> -> <<>>
    end.
|
15
src/glentities_ffi.mjs
Normal file
15
src/glentities_ffi.mjs
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
// FFI shim: forwards to `String.prototype.normalize`, with `mode` naming the
// Unicode normalisation form to apply to `str`.
export function normalize(str, mode) {
  const normalised = str.normalize(mode);
  return normalised;
}
|
||||||
|
|
||||||
|
// Returns the `len` UTF-16 code units of `string` starting at index `from`.
export function slice(string, from, len) {
  const end = from + len;
  return string.slice(from, end);
}
|
||||||
|
|
||||||
|
// Returns the first UTF-16 code unit of `string` as a string, or "" when
// `string` is empty.
export function first(string) {
  const head = string.slice(0, 1);
  return head;
}
|
||||||
|
|
||||||
|
// Returns `string` without its first UTF-16 code unit; "" stays "".
export function drop_first(string) {
  const tail = string.slice(1);
  return tail;
}
|
|
@ -56,8 +56,8 @@ pub fn roundtrip_named_test() {
|
||||||
|
|
||||||
should.equal(
|
should.equal(
|
||||||
input
|
input
|
||||||
|> glentities.encode(glentities.Named)
|
|> glentities.encode(glentities.Named)
|
||||||
|> glentities.decode(),
|
|> glentities.decode(),
|
||||||
input,
|
input,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
@ -70,8 +70,8 @@ pub fn roundtrip_hex_test() {
|
||||||
|
|
||||||
should.equal(
|
should.equal(
|
||||||
input
|
input
|
||||||
|> glentities.encode(glentities.Hex)
|
|> glentities.encode(glentities.Hex)
|
||||||
|> glentities.decode(),
|
|> glentities.decode(),
|
||||||
input,
|
input,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
@ -85,22 +85,22 @@ pub fn tco_test() {
|
||||||
|
|
||||||
should.equal(
|
should.equal(
|
||||||
input
|
input
|
||||||
|> glentities.encode(glentities.Hex)
|
|> glentities.encode(glentities.Hex)
|
||||||
|> glentities.decode(),
|
|> glentities.decode(),
|
||||||
input,
|
input,
|
||||||
)
|
)
|
||||||
|
|
||||||
should.equal(
|
should.equal(
|
||||||
input
|
input
|
||||||
|> glentities.encode(glentities.Named)
|
|> glentities.encode(glentities.Named)
|
||||||
|> glentities.decode(),
|
|> glentities.decode(),
|
||||||
input,
|
input,
|
||||||
)
|
)
|
||||||
|
|
||||||
should.equal(
|
should.equal(
|
||||||
input
|
input
|
||||||
|> glentities.encode(glentities.HTMLBody)
|
|> glentities.encode(glentities.HTMLBody)
|
||||||
|> glentities.decode(),
|
|> glentities.decode(),
|
||||||
input,
|
input,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue