Many new improvements, bump version to 1.1.0.
* Refactor the numeric decode regex into a module attribute to avoid recreating it on every call. * Use the character table also when encoding. This results in some compile warnings, but the code works as expected. Removing the warnings is left as a TODO. * Bump Elixir version to 0.11.x * Write tests for encoding and decoding
This commit is contained in:
parent
d96eae16d3
commit
5990f698fe
4 changed files with 111 additions and 18 deletions
|
@ -3,7 +3,10 @@ defmodule ExCoder do
|
||||||
# Change this in case the definition of UTF-8 changes for some reason :)
|
# Change this in case the definition of UTF-8 changes for some reason :)
|
||||||
@max_codepoint 1114111
|
@max_codepoint 1114111
|
||||||
|
|
||||||
decode_table = [
|
# Regex for matching numeric entities
|
||||||
|
@numeric_decode_regex %R/^(?:\d+|x[\da-f]+);/i
|
||||||
|
|
||||||
|
codec_table = [
|
||||||
{ "	", " " },
|
{ "	", " " },
|
||||||
{"
", "
|
{"
", "
|
||||||
"},
|
"},
|
||||||
|
@ -2038,15 +2041,19 @@ defmodule ExCoder do
|
||||||
{ "𝕫", "𝕫" }
|
{ "𝕫", "𝕫" }
|
||||||
]
|
]
|
||||||
|
|
||||||
lc { entity, value } inlist decode_table do
|
|
||||||
def decode(<< unquote(entity), rest :: binary >>), do: unquote(value) <> decode rest
|
|
||||||
|
# DECODING
|
||||||
|
|
||||||
|
# Generate a decoding function for each element in the replacement table above
|
||||||
|
lc { entity, char } inlist codec_table do
|
||||||
|
def decode(<< unquote(entity), rest :: binary >>), do: unquote(char) <> decode rest
|
||||||
end
|
end
|
||||||
|
|
||||||
def decode(""), do: ""
|
def decode(""), do: ""
|
||||||
|
|
||||||
def decode(<< "&#", rest :: binary >>) do
|
def decode(<< "&#", rest :: binary >>) do
|
||||||
r = %R/^(?:\d+|x[\da-f]+);/i
|
if Regex.match? @numeric_decode_regex, rest do
|
||||||
if Regex.match? r, rest do
|
|
||||||
decode rest, :numericmode
|
decode rest, :numericmode
|
||||||
else
|
else
|
||||||
"&#" <> decode rest
|
"&#" <> decode rest
|
||||||
|
@ -2077,22 +2084,35 @@ defmodule ExCoder do
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def encode(str) when is_binary str do
|
# ENCODING
|
||||||
String.codepoints(str) |> encode
|
|
||||||
|
# Pass newlines through unchanged (do not entity-encode them) when encoding
|
||||||
|
def encode(<< "
|
||||||
|
", rest :: binary >>), do: "
|
||||||
|
" <> encode rest
|
||||||
|
|
||||||
|
# Replace anything on the codec table with the named entity version
|
||||||
|
lc { entity, char } inlist codec_table do
|
||||||
|
def encode(<< unquote(char), rest :: binary >>), do: unquote(entity) <> encode rest
|
||||||
end
|
end
|
||||||
|
|
||||||
def encode([]) do
|
def encode(<< codepoint :: utf8, rest :: binary >>) do
|
||||||
|
encode(codepoint) <> encode rest
|
||||||
|
end
|
||||||
|
|
||||||
|
# Encode printable ASCII (codepoints 32-126) as is
|
||||||
|
def encode(codepoint) when is_number(codepoint) and codepoint >= 32 and codepoint <= 126 do
|
||||||
|
<< codepoint :: utf8 >>
|
||||||
|
end
|
||||||
|
|
||||||
|
def encode(codepoint) when is_number codepoint do
|
||||||
|
"&#x" <> integer_to_binary(codepoint, 16) <> ";"
|
||||||
|
end
|
||||||
|
|
||||||
|
def encode("") do
|
||||||
""
|
""
|
||||||
end
|
end
|
||||||
|
|
||||||
def encode([<< codepoint :: utf8 >> | rest]) when codepoint >= 32 and codepoint <= 126 do
|
|
||||||
<< codepoint :: utf8 >> <> encode rest
|
|
||||||
end
|
|
||||||
|
|
||||||
def encode([<< codepoint :: utf8 >> | rest]) do
|
|
||||||
"&#x" <> integer_to_binary(codepoint, 16) <> ";" <> encode rest
|
|
||||||
end
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
4
mix.exs
4
mix.exs
|
@ -3,8 +3,8 @@ defmodule Excoder.Mixfile do
|
||||||
|
|
||||||
def project do
|
def project do
|
||||||
[ app: :excoder,
|
[ app: :excoder,
|
||||||
version: "1.0.0",
|
version: "1.1.0",
|
||||||
elixir: "~> 0.10.1-dev",
|
elixir: "~> 0.11",
|
||||||
deps: deps ]
|
deps: deps ]
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
72
test/excoder_test.exs
Normal file
72
test/excoder_test.exs
Normal file
|
@ -0,0 +1,72 @@
|
||||||
|
Code.require_file "test_helper.exs", __DIR__
|
||||||
|
|
||||||
|
defmodule ExcoderTest do
|
||||||
|
use ExUnit.Case
|
||||||
|
|
||||||
|
test "Decode single entity" do
|
||||||
|
assert(ExCoder.decode("&") == "&")
|
||||||
|
end
|
||||||
|
|
||||||
|
test "Decode numeric entity" do
|
||||||
|
assert(ExCoder.decode("ᄀ") == "ᄀ")
|
||||||
|
end
|
||||||
|
|
||||||
|
test "Decode hexadecimal entity" do
|
||||||
|
assert(ExCoder.decode("ģ") == "ģ")
|
||||||
|
end
|
||||||
|
|
||||||
|
test "Decode invalid hexadecimal" do
|
||||||
|
assert(ExCoder.decode("h23;") == "h23;")
|
||||||
|
end
|
||||||
|
|
||||||
|
test "Decode invalid numeric" do
|
||||||
|
assert(ExCoder.decode("Džg3;") == "Džg3;")
|
||||||
|
end
|
||||||
|
|
||||||
|
test "Decode HTML" do
|
||||||
|
assert(ExCoder.decode("""
|
||||||
|
<div class='line' id='LC9'> <span class="ss">address:</span> <span class="s2">"chat.eu.freenode.net"</span><span class="p">,</span></div><div class='line' id='LC10'> <span class="ss">
|
||||||
|
"""
|
||||||
|
) == """
|
||||||
|
<div class='line' id='LC9'> <span class="ss">address:</span> <span class="s2">"chat.eu.freenode.net"</span><span class="p">,</span></div><div class='line' id='LC10'> <span class="ss">
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
end
|
||||||
|
|
||||||
|
test "Decode invalid case" do
|
||||||
|
assert(ExCoder.decode("&AOPF;") == "&AOPF;")
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
test "Encode simple text" do
|
||||||
|
assert(ExCoder.encode("foo and bar") == "foo and bar")
|
||||||
|
end
|
||||||
|
|
||||||
|
test "Encode linebreak" do
|
||||||
|
assert(ExCoder.encode("
|
||||||
|
foo
|
||||||
|
and
|
||||||
|
bar
|
||||||
|
") == "
|
||||||
|
foo
|
||||||
|
and
|
||||||
|
bar
|
||||||
|
")
|
||||||
|
end
|
||||||
|
|
||||||
|
test "Encode scandinavian characters" do
|
||||||
|
assert(ExCoder.encode("Hääyöaie liittyy öylättiin, sanoi Åke.") == "Hääyöaie liittyy öylättiin, sanoi Åke.")
|
||||||
|
end
|
||||||
|
|
||||||
|
test "Encode apple signs" do
|
||||||
|
assert(ExCoder.encode("⌘⌥⇧ and the last one is ⎋") == "⌘⌥⇧ and the last one is ⎋")
|
||||||
|
end
|
||||||
|
|
||||||
|
test "Encode decode roundtrip with very difficult unicode (HE COMES)" do
|
||||||
|
zalgo = "Z͉̘͕͐ḁ̗̕ͅl̟̥̳̞̞͔ͬ̔̂̋̾gͮͫ̓̓̕o͎͉̹͕͌͂̍͐̌̋ ͕̖͖ͯĩ͇̹̬̤͕̟ͭ̇ș̼̠̹̒̂ͭͯ̓ ͉̪͍͚̗̟͚ḟ̴̙̟̯͚̭̳̼̈́û̔ͥ̒nͥͩͭ̎̃!̫͔ ̲̯̰̰̗͔ͪ͝R̭͉̬͓̜͉ͮͬͤ̃ͯ͊a̡̩̍͊i̖̺̝̻ͦ̋̾͑͌̇ņ͍̖̟͚͓ͧ͒b̻̹͉͉̘̎̄ͬ̆͗͠ơ̮̱͉͓̗̝̯ẅ͍̱́̋ͧͭ̾̐̇s̯͓̦͓̦͇͚̎ͦ̌͐̾ ̟̞̑̉̈̎̈́l̯͉̯ͭ͡ȏ̟͙̯̞̫̳̮͒ͭ̎͆l̼͒ͩ̌̀͒l̒͒ͭ̌ͩ̎͘ͅi̝̲ͬ̋̾̉̋p̢̣͉͒̾͗ͨͯö͙̳̘̜̓ͣ͒́̿ͫͅp̵͔͎͓sͅ.̶̹͙̼̺ͧ͂̒"
|
||||||
|
assert(ExCoder.decode(ExCoder.encode(zalgo)) == zalgo)
|
||||||
|
end
|
||||||
|
end
|
1
test/test_helper.exs
Normal file
1
test/test_helper.exs
Normal file
|
@ -0,0 +1 @@
|
||||||
|
ExUnit.start
|
Loading…
Reference in a new issue