Many new improvements, bump version to 1.1.0.

* Refactor the numeric decode regex into a module attribute to avoid
  recreating it on every call.
* Use the character table for encoding as well. This results in some compile
  warnings, but the code works as expected. Removing the warnings is left as a
  TODO.
* Bump Elixir version to 0.11.x
* Write tests for encoding and decoding
This commit is contained in:
Mikko Ahlroth 2013-11-25 21:34:39 +02:00
parent d96eae16d3
commit 5990f698fe
4 changed files with 111 additions and 18 deletions

View file

@ -3,7 +3,10 @@ defmodule ExCoder do
# Change this in case the definition of UTF-8 changes for some reason :)
@max_codepoint 1114111
decode_table = [
# Regex for matching numeric entities
@numeric_decode_regex %R/^(?:\d+|x[\da-f]+);/i
codec_table = [
{ "	", " " },
{"
", "
"},
@ -2038,15 +2041,19 @@ defmodule ExCoder do
{ "𝕫", "𝕫" }
]
lc { entity, value } inlist decode_table do
def decode(<< unquote(entity), rest :: binary >>), do: unquote(value) <> decode rest
# DECODING
# Generate a decoding function for each element in the replacement table above
lc { entity, char } inlist codec_table do
def decode(<< unquote(entity), rest :: binary >>), do: unquote(char) <> decode rest
end
def decode(""), do: ""
def decode(<< "&#", rest :: binary >>) do
r = %R/^(?:\d+|x[\da-f]+);/i
if Regex.match? r, rest do
if Regex.match? @numeric_decode_regex, rest do
decode rest, :numericmode
else
"&#" <> decode rest
@ -2077,22 +2084,35 @@ defmodule ExCoder do
def encode(str) when is_binary str do
String.codepoints(str) |> encode
# ENCODING
# Skip newline when encoding
def encode(<< "
", rest :: binary >>), do: "
" <> encode rest
# Replace anything on the codec table with the named entity version
lc { entity, char } inlist codec_table do
def encode(<< unquote(char), rest :: binary >>), do: unquote(entity) <> encode rest
end
def encode([]) do
def encode(<< codepoint :: utf8, rest :: binary >>) do
encode(codepoint) <> encode rest
end
# Encode a-z as is
def encode(codepoint) when is_number(codepoint) and codepoint >= 32 and codepoint <= 126 do
<< codepoint :: utf8 >>
end
def encode(codepoint) when is_number codepoint do
"&#x" <> integer_to_binary(codepoint, 16) <> ";"
end
def encode("") do
""
end
def encode([<< codepoint :: utf8 >> | rest]) when codepoint >= 32 and codepoint <= 126 do
<< codepoint :: utf8 >> <> encode rest
end
def encode([<< codepoint :: utf8 >> | rest]) do
"&#x" <> integer_to_binary(codepoint, 16) <> ";" <> encode rest
end

View file

@ -3,8 +3,8 @@ defmodule Excoder.Mixfile do
def project do
[ app: :excoder,
version: "1.0.0",
elixir: "~> 0.10.1-dev",
version: "1.1.0",
elixir: "~> 0.11",
deps: deps ]
end

72
test/excoder_test.exs Normal file
View file

@ -0,0 +1,72 @@
Code.require_file "test_helper.exs", __DIR__
defmodule ExcoderTest do
  use ExUnit.Case

  # Tests for ExCoder.decode/1 (HTML entities -> characters) and
  # ExCoder.encode/1 (characters -> HTML entities).
  #
  # Characters that are invisible or easily lost when the file is copied
  # (non-breaking space, private-use codepoints, rare scripts) are built from
  # their codepoints with `<< n :: utf8 >>` instead of being typed literally,
  # so the expectations stay readable and cannot silently degrade.

  # --- Decoding ------------------------------------------------------------

  test "Decode single entity" do
    assert ExCoder.decode("&amp;") == "&"
  end

  test "Decode numeric entity" do
    # Decimal 4352 is codepoint U+1100; the expected value must be that
    # character, not an empty string.
    assert ExCoder.decode("&#4352;") == << 4352 :: utf8 >>
  end

  test "Decode hexadecimal entity" do
    assert ExCoder.decode("&#x123;") == "ģ"
  end

  test "Decode invalid hexadecimal" do
    # "h" is not a hex digit, so the text must pass through untouched.
    assert ExCoder.decode("&#x1h23;") == "&#x1h23;"
  end

  test "Decode invalid numeric" do
    # "g" is not a decimal digit, so the text must pass through untouched.
    assert ExCoder.decode("&#453g3;") == "&#453g3;"
  end

  test "Decode HTML" do
    # &nbsp; is U+00A0 (NO-BREAK SPACE) in the HTML named character reference
    # table. Build the run of six explicitly: typed literally they look exactly
    # like ordinary spaces and are easily replaced by them.
    indent = String.duplicate(<< 160 :: utf8 >>, 6)

    encoded = """
    <div class='line' id='LC9'>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span class="ss">address:</span> <span class="s2">&quot;chat.eu.freenode.net&quot;</span><span class="p">,</span></div><div class='line' id='LC10'>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span class="ss">
    """

    decoded = """
    <div class='line' id='LC9'>#{indent}<span class="ss">address:</span> <span class="s2">"chat.eu.freenode.net"</span><span class="p">,</span></div><div class='line' id='LC10'>#{indent}<span class="ss">
    """

    assert ExCoder.decode(encoded) == decoded
  end

  test "Decode invalid case" do
    # Entity names are case sensitive; an unknown name is left as is.
    assert ExCoder.decode("&AOPF;") == "&AOPF;"
  end

  # --- Encoding ------------------------------------------------------------

  test "Encode simple text" do
    assert ExCoder.encode("foo and bar") == "foo and bar"
  end

  test "Encode linebreak" do
    # Newlines are expected to be emitted unchanged. Escapes are used so the
    # literal does not depend on how this file is indented.
    text = "\nfoo\nand\nbar\n"
    assert ExCoder.encode(text) == text
  end

  test "Encode scandinavian characters" do
    assert ExCoder.encode("Hääyöaie liittyy öylättiin, sanoi Åke.") == "H&auml;&auml;y&ouml;aie liittyy &ouml;yl&auml;ttiin&comma; sanoi &Aring;ke&period;"
  end

  test "Encode apple signs" do
    # U+F8FF is a private-use codepoint, so it is constructed explicitly; the
    # expected output starts with its hexadecimal entity.
    apple_logo = << 0xF8FF :: utf8 >>

    assert ExCoder.encode(apple_logo <> "⌘⌥⇧ and the last one is ⎋") == "&#xF8FF;&#x2318;&#x2325;&#x21E7; and the last one is &#x238B;"
  end

  # --- Roundtrip -----------------------------------------------------------

  test "Encode decode roundtrip with very difficult unicode (HE COMES)" do
    zalgo = "Z͉̘͕͐ḁ̗̕ͅl̟̥̳̞̞͔ͬ̔̂̋̾gͮͫ̓̓̕o͎͉̹͕͌͂̍͐̌̋ ͕̖͖ͯĩ͇̹̬̤͕̟ͭ̇ș̼̠̹̒̂ͭͯ̓ ͉̪͍͚̗̟͚ḟ̴̙̟̯͚̭̳̼̈́û̔ͥ̒nͥͩͭ̎̃!̫͔ ̲̯̰̰̗͔ͪ͝R̭͉̬͓̜͉ͮͬͤ̃ͯ͊a̡̩̍͊i̖̺̝̻ͦ̋̾͑͌̇ņ͍̖̟͚͓ͧ͒b̻̹͉͉̘̎̄ͬ̆͗͠ơ̮̱͉͓̗̝̯ẅ͍̱́̋ͧͭ̾̐̇s̯͓̦͓̦͇͚̎ͦ̌͐̾ ̟̞̑̉̈̎̈́l̯͉̯ͭ͡ȏ̟͙̯̞̫̳̮͒ͭ̎͆l̼͒ͩ̌̀͒l̒͒ͭ̌ͩ̎͘ͅi̝̲ͬ̋̾̉̋p̢̣͉͒̾͗ͨͯö͙̳̘̜̓ͣ͒́̿ͫͅp̵͔͎͓sͅ.̶̹͙̼̺ͧ͂̒"
    assert ExCoder.decode(ExCoder.encode(zalgo)) == zalgo
  end
end

1
test/test_helper.exs Normal file
View file

@ -0,0 +1 @@
# Start ExUnit; the test files load this helper before defining their cases.
ExUnit.start()