Many new improvements, bump version to 1.1.0.
* Refactor the numeric decode regex into a module attribute to avoid recreating it on every call. * Use the character table when encoding as well. This results in some compile warnings, but the code works as expected; removing the warnings is left as a TODO. * Bump Elixir version to 0.11.x * Write tests for encoding and decoding
This commit is contained in:
parent
d96eae16d3
commit
5990f698fe
4 changed files with 111 additions and 18 deletions
|
@ -3,7 +3,10 @@ defmodule ExCoder do
|
|||
# Change this in case the definition of UTF-8 changes for some reason :)
|
||||
@max_codepoint 1114111
|
||||
|
||||
decode_table = [
|
||||
# Regex for matching numeric entities
|
||||
@numeric_decode_regex %R/^(?:\d+|x[\da-f]+);/i
|
||||
|
||||
codec_table = [
|
||||
{ "&Tab;", " " },
|
||||
{"&NewLine;", "
|
||||
"},
|
||||
|
@ -2038,15 +2041,19 @@ defmodule ExCoder do
|
|||
{ "&zopf;", "𝕫" }
|
||||
]
|
||||
|
||||
lc { entity, value } inlist decode_table do
|
||||
def decode(<< unquote(entity), rest :: binary >>), do: unquote(value) <> decode rest
|
||||
|
||||
|
||||
# DECODING
|
||||
|
||||
# Generate a decoding function for each element in the replacement table above
|
||||
lc { entity, char } inlist codec_table do
|
||||
def decode(<< unquote(entity), rest :: binary >>), do: unquote(char) <> decode rest
|
||||
end
|
||||
|
||||
def decode(""), do: ""
|
||||
|
||||
def decode(<< "&#", rest :: binary >>) do
|
||||
r = %R/^(?:\d+|x[\da-f]+);/i
|
||||
if Regex.match? r, rest do
|
||||
if Regex.match? @numeric_decode_regex, rest do
|
||||
decode rest, :numericmode
|
||||
else
|
||||
"&#" <> decode rest
|
||||
|
@ -2077,22 +2084,35 @@ defmodule ExCoder do
|
|||
|
||||
|
||||
|
||||
def encode(str) when is_binary str do
|
||||
String.codepoints(str) |> encode
|
||||
# ENCODING
|
||||
|
||||
# Skip newline when encoding
|
||||
def encode(<< "
|
||||
", rest :: binary >>), do: "
|
||||
" <> encode rest
|
||||
|
||||
# Replace anything on the codec table with the named entity version
|
||||
lc { entity, char } inlist codec_table do
|
||||
def encode(<< unquote(char), rest :: binary >>), do: unquote(entity) <> encode rest
|
||||
end
|
||||
|
||||
def encode([]) do
|
||||
def encode(<< codepoint :: utf8, rest :: binary >>) do
|
||||
encode(codepoint) <> encode rest
|
||||
end
|
||||
|
||||
# Encode printable ASCII (codepoints 32..126) as is
|
||||
def encode(codepoint) when is_number(codepoint) and codepoint >= 32 and codepoint <= 126 do
|
||||
<< codepoint :: utf8 >>
|
||||
end
|
||||
|
||||
def encode(codepoint) when is_number codepoint do
|
||||
"&#x" <> integer_to_binary(codepoint, 16) <> ";"
|
||||
end
|
||||
|
||||
def encode("") do
|
||||
""
|
||||
end
|
||||
|
||||
def encode([<< codepoint :: utf8 >> | rest]) when codepoint >= 32 and codepoint <= 126 do
|
||||
<< codepoint :: utf8 >> <> encode rest
|
||||
end
|
||||
|
||||
def encode([<< codepoint :: utf8 >> | rest]) do
|
||||
"&#x" <> integer_to_binary(codepoint, 16) <> ";" <> encode rest
|
||||
end
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
4
mix.exs
4
mix.exs
|
@ -3,8 +3,8 @@ defmodule Excoder.Mixfile do
|
|||
|
||||
def project do
|
||||
[ app: :excoder,
|
||||
version: "1.0.0",
|
||||
elixir: "~> 0.10.1-dev",
|
||||
version: "1.1.0",
|
||||
elixir: "~> 0.11",
|
||||
deps: deps ]
|
||||
end
|
||||
|
||||
|
|
72
test/excoder_test.exs
Normal file
72
test/excoder_test.exs
Normal file
|
@ -0,0 +1,72 @@
|
|||
Code.require_file "test_helper.exs", __DIR__
|
||||
|
||||
defmodule ExcoderTest do
|
||||
use ExUnit.Case
|
||||
|
||||
test "Decode single entity" do
|
||||
assert(ExCoder.decode("&amp;") == "&")
|
||||
end
|
||||
|
||||
test "Decode numeric entity" do
|
||||
assert(ExCoder.decode("&#4352;") == "ᄀ")
|
||||
end
|
||||
|
||||
test "Decode hexadecimal entity" do
|
||||
assert(ExCoder.decode("&#x123;") == "ģ")
|
||||
end
|
||||
|
||||
test "Decode invalid hexadecimal" do
|
||||
assert(ExCoder.decode("h23;") == "h23;")
|
||||
end
|
||||
|
||||
test "Decode invalid numeric" do
|
||||
assert(ExCoder.decode("&#453g3;") == "&#453g3;")
|
||||
end
|
||||
|
||||
test "Decode HTML" do
|
||||
assert(ExCoder.decode("""
|
||||
<div class='line' id='LC9'> <span class="ss">address:</span> <span class="s2">"chat.eu.freenode.net"</span><span class="p">,</span></div><div class='line' id='LC10'> <span class="ss">
|
||||
"""
|
||||
) == """
|
||||
<div class='line' id='LC9'> <span class="ss">address:</span> <span class="s2">"chat.eu.freenode.net"</span><span class="p">,</span></div><div class='line' id='LC10'> <span class="ss">
|
||||
"""
|
||||
)
|
||||
end
|
||||
|
||||
test "Decode invalid case" do
|
||||
assert(ExCoder.decode("&AOPF;") == "&AOPF;")
|
||||
end
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
test "Encode simple text" do
|
||||
assert(ExCoder.encode("foo and bar") == "foo and bar")
|
||||
end
|
||||
|
||||
test "Encode linebreak" do
|
||||
assert(ExCoder.encode("
|
||||
foo
|
||||
and
|
||||
bar
|
||||
") == "
|
||||
foo
|
||||
and
|
||||
bar
|
||||
")
|
||||
end
|
||||
|
||||
test "Encode scandinavian characters" do
|
||||
assert(ExCoder.encode("Hääyöaie liittyy öylättiin, sanoi Åke.") == "Hääyöaie liittyy öylättiin, sanoi Åke.")
|
||||
end
|
||||
|
||||
test "Encode apple signs" do
|
||||
assert(ExCoder.encode("⌘⌥⇧ and the last one is ⎋") == "&#x2318;&#x2325;&#x21E7; and the last one is &#x238B;")
|
||||
end
|
||||
|
||||
test "Encode decode roundtrip with very difficult unicode (HE COMES)" do
|
||||
zalgo = "Z͉̘͕͐ḁ̗̕ͅl̟̥̳̞̞͔ͬ̔̂̋̾gͮͫ̓̓̕o͎͉̹͕͌͂̍͐̌̋ ͕̖͖ͯĩ͇̹̬̤͕̟ͭ̇ș̼̠̹̒̂ͭͯ̓ ͉̪͍͚̗̟͚ḟ̴̙̟̯͚̭̳̼̈́û̔ͥ̒nͥͩͭ̎̃!̫͔ ̲̯̰̰̗͔ͪ͝R̭͉̬͓̜͉ͮͬͤ̃ͯ͊a̡̩̍͊i̖̺̝̻ͦ̋̾͑͌̇ņ͍̖̟͚͓ͧ͒b̻̹͉͉̘̎̄ͬ̆͗͠ơ̮̱͉͓̗̝̯ẅ͍̱́̋ͧͭ̾̐̇s̯͓̦͓̦͇͚̎ͦ̌͐̾ ̟̞̑̉̈̎̈́l̯͉̯ͭ͡ȏ̟͙̯̞̫̳̮͒ͭ̎͆l̼͒ͩ̌̀͒l̒͒ͭ̌ͩ̎͘ͅi̝̲ͬ̋̾̉̋p̢̣͉͒̾͗ͨͯö͙̳̘̜̓ͣ͒́̿ͫͅp̵͔͎͓sͅ.̶̹͙̼̺ͧ͂̒"
|
||||
assert(ExCoder.decode(ExCoder.encode(zalgo)) == zalgo)
|
||||
end
|
||||
end
|
1
test/test_helper.exs
Normal file
1
test/test_helper.exs
Normal file
|
@ -0,0 +1 @@
|
|||
ExUnit.start
|
Loading…
Reference in a new issue