Many new improvements, bump version to 1.1.0.

* Refactor the numeric decode regex into a module attribute to avoid recreating it on every call.
* Also use the character table when encoding. This results in some compile warnings, but the code works as expected; removing the warnings is left as a TODO.
* Bump Elixir version to 0.11.x.
* Write tests for encoding and decoding.
2013-11-25 21:34:39 +02:00 · 2013-11-25 21:34:39 +02:00 · 5990f698fe
commit 5990f698fe
parent d96eae16d3
4 changed files with 111 additions and 18 deletions
--- a/lib/excoder.ex
+++ b/lib/excoder.ex
@ -3,7 +3,10 @@ defmodule ExCoder do
    # Change this in case the definition of UTF-8 changes for some reason :)
    @max_codepoint 1114111

-    decode_table = [
+    # Regex for matching numeric entities
+    @numeric_decode_regex %R/^(?:\d+|x[\da-f]+);/i
+
+    codec_table = [
        { "&Tab;", "  " },
        {"&NewLine;", "
 "},
@ -2038,15 +2041,19 @@ defmodule ExCoder do
        { "&zopf;", "𝕫" }
    ]

-    lc { entity, value } inlist decode_table do
-        def decode(<< unquote(entity), rest :: binary >>), do: unquote(value) <> decode rest
+
+
+    # DECODING
+
+    # Generate a decoding function for each element in the replacement table above
+    lc { entity, char } inlist codec_table do
+        def decode(<< unquote(entity), rest :: binary >>), do: unquote(char) <> decode rest
    end

    def decode(""), do: ""

    def decode(<< "&#", rest :: binary >>) do
-        r = %R/^(?:\d+|x[\da-f]+);/i
-        if Regex.match? r, rest do
+        if Regex.match? @numeric_decode_regex, rest do
            decode rest, :numericmode
        else
            "&#" <> decode rest
@ -2077,22 +2084,35 @@ defmodule ExCoder do



-    def encode(str) when is_binary str do
-        String.codepoints(str) |> encode
+    # ENCODING
+
+    # Skip newline when encoding
+    def encode(<< "
+", rest :: binary >>), do: "
+" <> encode rest 
+
+    # Replace anything on the codec table with the named entity version
+    lc { entity, char } inlist codec_table do
+        def encode(<< unquote(char), rest :: binary >>), do: unquote(entity) <> encode rest
    end

-    def encode([]) do
+    def encode(<< codepoint :: utf8, rest :: binary >>) do
+        encode(codepoint) <> encode rest
+    end
+
+    # Encode remaining printable ASCII (codepoints 32-126) as is
+    def encode(codepoint) when is_number(codepoint) and codepoint >= 32 and codepoint <= 126 do
+        << codepoint :: utf8 >>
+    end
+
+    def encode(codepoint) when is_number codepoint do
+        "&#x" <> integer_to_binary(codepoint, 16) <> ";"
+    end
+
+    def encode("") do
        ""
    end

-    def encode([<< codepoint :: utf8 >> | rest]) when codepoint >= 32 and codepoint <= 126 do
-        << codepoint :: utf8 >> <> encode rest
-    end
-
-    def encode([<< codepoint :: utf8 >> | rest]) do
-        "&#x" <> integer_to_binary(codepoint, 16) <> ";" <> encode rest
-    end
-



--- a/mix.exs
+++ b/mix.exs
@ -3,8 +3,8 @@ defmodule Excoder.Mixfile do

  def project do
    [ app: :excoder,
-      version: "1.0.0",
-      elixir: "~> 0.10.1-dev",
+      version: "1.1.0",
+      elixir: "~> 0.11",
      deps: deps ]
  end

--- a/test/excoder_test.exs
+++ b/test/excoder_test.exs
@ -0,0 +1,72 @@
+Code.require_file "test_helper.exs", __DIR__
+
+defmodule ExcoderTest do
+    use ExUnit.Case
+
+    test "Decode single entity" do
+        assert(ExCoder.decode("&amp;") == "&")
+    end
+
+    test "Decode numeric entity" do
+        assert(ExCoder.decode("&#4352;") == "ᄀ")
+    end
+
+    test "Decode hexadecimal entity" do
+        assert(ExCoder.decode("&#x123;") == "ģ")
+    end
+
+    test "Decode invalid hexadecimal" do
+        assert(ExCoder.decode("&#x1h23;") == "&#x1h23;")
+    end
+
+    test "Decode invalid numeric" do
+        assert(ExCoder.decode("&#453g3;") == "&#453g3;")
+    end
+
+    test "Decode HTML" do
+        assert(ExCoder.decode("""
+<div class='line' id='LC9'>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span class="ss">address:</span> <span class="s2">&quot;chat.eu.freenode.net&quot;</span><span class="p">,</span></div><div class='line' id='LC10'>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span class="ss">
+"""
+) == """
+<div class='line' id='LC9'>      <span class="ss">address:</span> <span class="s2">"chat.eu.freenode.net"</span><span class="p">,</span></div><div class='line' id='LC10'>      <span class="ss">
+"""
+        )
+    end
+
+    test "Decode invalid case" do
+        assert(ExCoder.decode("&AOPF;") == "&AOPF;")
+    end
+
+
+
+
+
+    test "Encode simple text" do
+        assert(ExCoder.encode("foo and bar") == "foo and bar")
+    end
+
+    test "Encode linebreak" do
+        assert(ExCoder.encode("
+foo
+and
+bar
+") == "
+foo
+and
+bar
+")
+    end
+
+    test "Encode scandinavian characters" do
+        assert(ExCoder.encode("Hääyöaie liittyy öylättiin, sanoi Åke.") == "H&auml;&auml;y&ouml;aie liittyy &ouml;yl&auml;ttiin&comma; sanoi &Aring;ke&period;")
+    end
+
+    test "Encode apple signs" do
+        assert(ExCoder.encode("⌘⌥⇧ and the last one is ⎋") == "&#xF8FF;&#x2318;&#x2325;&#x21E7; and the last one is &#x238B;")
+    end
+
+    test "Encode decode roundtrip with very difficult unicode (HE COMES)" do
+        zalgo = "Z͉̘͕͐ḁ̗̕ͅl̟̥̳̞̞͔ͬ̔̂̋̾gͮͫ̓̓̕o͎͉̹͕͌͂̍͐̌̋ ͕̖͖ͯĩ͇̹̬̤͕̟ͭ̇ș̼̠̹̒̂ͭͯ̓ ͉̪͍͚̗̟͚ḟ̴̙̟̯͚̭̳̼̈́û̔ͥ̒nͥͩͭ̎̃!̫͔ ̲̯̰̰̗͔ͪ͝R̭͉̬͓̜͉ͮͬͤ̃ͯ͊a̡̩̍͊i̖̺̝̻ͦ̋̾͑͌̇ņ͍̖̟͚͓ͧ͒b̻̹͉͉̘̎̄ͬ̆͗͠ơ̮̱͉͓̗̝̯ẅ͍̱́̋ͧͭ̾̐̇s̯͓̦͓̦͇͚̎ͦ̌͐̾ ̟̞̑̉̈̎̈́l̯͉̯ͭ͡ȏ̟͙̯̞̫̳̮͒ͭ̎͆l̼͒ͩ̌̀͒l̒͒ͭ̌ͩ̎͘ͅi̝̲ͬ̋̾̉̋p̢̣͉͒̾͗ͨͯö͙̳̘̜̓ͣ͒́̿ͫͅp̵͔͎͓sͅ.̶̹͙̼̺ͧ͂̒"
+        assert(ExCoder.decode(ExCoder.encode(zalgo)) == zalgo)
+    end
+end
--- a/test/test_helper.exs
+++ b/test/test_helper.exs
@ -0,0 +1 @@
+ExUnit.start