From 99307a6b3e09f23443518191ec2f9a022bc51ca3 Mon Sep 17 00:00:00 2001
From: Mikko Ahlroth <mikko.ahlroth@gmail.com>
Date: Wed, 6 Aug 2014 20:00:21 +0300
Subject: [PATCH] Rewrite ExCoder for 0.15, use tail call recursion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md             |  2 +-
 lib/excoder.ex        | 90 ++++++++++++++++++++++++++++---------------
 mix.exs               |  4 +-
 test/excoder_test.exs | 16 +++-----
 4 files changed, 67 insertions(+), 45 deletions(-)

diff --git a/README.md b/README.md
index 663402d..db0c75b 100644
--- a/README.md
+++ b/README.md
@@ -4,4 +4,4 @@ about the mess of parsing HTML) and I did not want to use a full XML library
 from the Erlang side.
 
 ExCoder supports both decoding named and numbered HTML entities. Encoding will
-always use the hexadecimal numbered format.
+use named entities where possible and fall back to hexadecimal encoded entities.
diff --git a/lib/excoder.ex b/lib/excoder.ex
index 26ae2bb..68b82cd 100644
--- a/lib/excoder.ex
+++ b/lib/excoder.ex
@@ -4,12 +4,11 @@ defmodule ExCoder do
     @max_codepoint 1114111
 
     # Regex for matching numeric entities
-    @numeric_decode_regex %R/^(?:\d+|x[\da-f]+);/i
+    @numeric_decode_regex ~R/^(?:\d+|x[\da-f]+);/i
 
     codec_table = [
         { "&Tab;", "  " },
-        {"&NewLine;", "
-"},
+        { "&NewLine;", "\n" },
         { "&excl;", "!" },
         { "&quot;", "\"" },
         { "&QUOT;", "\"" },
@@ -2045,24 +2044,33 @@ defmodule ExCoder do
 
     # DECODING
 
+    def decode(str), do: decode(str, "")
+
     # Generate a decoding function for each element in the replacement table above
-    lc { entity, char } inlist codec_table do
-        def decode(<< unquote(entity), rest :: binary >>), do: unquote(char) <> decode rest
-    end
-
-    def decode(""), do: ""
-
-    def decode(<< "&#", rest :: binary >>) do
-        if Regex.match? @numeric_decode_regex, rest do
-            decode rest, :numericmode
-        else
-            "&#" <> decode rest
+    for {entity, char} <- codec_table do
+        def decode(<< unquote(entity), rest :: binary >>, out) do
+            decode rest, out <> unquote(char)
         end
     end
 
-    def decode(str), do: String.first(str) <> decode String.slice str, 1, String.length str
+    def decode(<< "&#", rest :: binary >>, out) do
+        if Regex.match? @numeric_decode_regex, rest do
+            decode rest, out, :numericmode
+        else
+            decode rest, out <> "&#"
+        end
+    end
+
+    def decode("", out), do: out
+
+    # Copy unrecognized input through one byte at a time (entities always start with ASCII "&")
+    def decode(<< byte, rest :: binary >>, out) do
+        decode rest, out <> << byte >> # O(1) binary match instead of String.slice/String.length per step
+    end
 
 
+    # Decode a numeric character or return the original entity if the given
+    # number is out of range
     def decode(numeric, :numericmode, original) when is_number numeric do
         if numeric <= @max_codepoint do
             << numeric :: utf8 >>
@@ -2071,14 +2079,14 @@ defmodule ExCoder do
         end
     end
 
-    def decode(<< "x", rest :: binary >>, :numericmode) do
+    def decode(<< x, rest :: binary >>, out, :numericmode) when x in [?x, ?X] do # regex is case-insensitive, so "&#X..;" must land here too
         [numeric | rest] = strip_numeric rest
-        decode(binary_to_integer(numeric, 16), :numericmode, "x" <> numeric) <> decode(Enum.join rest, ";")
+        decode Enum.join(rest, ";"), out <> decode(String.to_integer(numeric, 16), :numericmode, << x >> <> numeric) # << x >> keeps the marker's original case
     end
 
-    def decode(str, :numericmode) do
+    def decode(str, out, :numericmode) do
         [numeric | rest] = strip_numeric str
-        decode(binary_to_integer(numeric), :numericmode, numeric) <> decode(Enum.join rest, ";")
+        decode Enum.join(rest, ";"), out <> decode(String.to_integer(numeric), :numericmode, numeric)
     end
 
 
@@ -2086,18 +2094,38 @@ defmodule ExCoder do
 
     # ENCODING
 
-    # Skip newline when encoding
-    def encode(<< "
-", rest :: binary >>), do: "
-" <> encode rest 
+    # Remove duplicate elements from replacement table when encoding.
+    #
+    # Several entities decode to the same character (for example "&quot;" and
+    # "&QUOT;" in the table above), but encoding needs exactly one entity per
+    # character. Enum.uniq/2 keeps the first entry seen for each character and
+    # preserves the table order, so the first entity listed for a character is
+    # the one used by the generated encode clauses below.
+    #
+    # NOTE: Enum.uniq/2 with a key function is the API of this Elixir version;
+    # later Elixir releases spell the same operation Enum.uniq_by/2.
+    encode_table = Enum.uniq codec_table, fn {_entity, char} -> char end
 
     # Replace anything on the codec table with the named entity version
-    lc { entity, char } inlist codec_table do
-        def encode(<< unquote(char), rest :: binary >>), do: unquote(entity) <> encode rest
+    for {entity, char} <- encode_table do
+        # Except skip newlines
+        if char != "\n" do
+            def encode(<< unquote(char), rest :: binary >>, out) do
+                encode rest, out <> unquote(entity)
+            end
+        else
+            def encode(<< "\n", rest :: binary >>, out) do
+                encode rest, out <> "\n"
+            end
+        end
     end
 
-    def encode(<< codepoint :: utf8, rest :: binary >>) do
-        encode(codepoint) <> encode rest
+    def encode(<< codepoint :: utf8, rest :: binary >>, out) do
+        encode rest, out <> encode(codepoint)
+    end
+
+    def encode("", out) do
+        out
     end
 
     # Encode a-z as is
@@ -2106,18 +2134,16 @@ defmodule ExCoder do
     end
 
     def encode(codepoint) when is_number codepoint do
-        "&#x" <> integer_to_binary(codepoint, 16) <> ";"
+        "&#x" <> Integer.to_string(codepoint, 16) <> ";"
     end
 
-    def encode("") do
-        ""
-    end
+    def encode(str), do: encode(str, "")
 
 
 
 
     defp strip_numeric(str) do
-        String.split(str, ";")
+        String.split str, ";"
     end
 
 end
diff --git a/mix.exs b/mix.exs
index bbc9b8d..e6da5a7 100644
--- a/mix.exs
+++ b/mix.exs
@@ -3,8 +3,8 @@ defmodule Excoder.Mixfile do
 
   def project do
     [ app: :excoder,
-      version: "1.1.0",
-      elixir: "~> 0.11",
+      version: "1.2.0",
+      elixir: "~> 0.15",
       deps: deps ]
   end
 
diff --git a/test/excoder_test.exs b/test/excoder_test.exs
index 34c9754..e21303f 100644
--- a/test/excoder_test.exs
+++ b/test/excoder_test.exs
@@ -23,6 +23,10 @@ defmodule ExcoderTest do
         assert(ExCoder.decode("&#453g3;") == "&#453g3;")
     end
 
+    test "Decode numeric that is too big" do
+        assert(ExCoder.decode("&#1114112;") == "&#1114112;")
+    end
+
     test "Decode HTML" do
         assert(ExCoder.decode("""
 <div class='line' id='LC9'>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span class="ss">address:</span> <span class="s2">&quot;chat.eu.freenode.net&quot;</span><span class="p">,</span></div><div class='line' id='LC10'>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span class="ss">
@@ -45,16 +49,8 @@ defmodule ExcoderTest do
         assert(ExCoder.encode("foo and bar") == "foo and bar")
     end
 
-    test "Encode linebreak" do
-        assert(ExCoder.encode("
-foo
-and
-bar
-") == "
-foo
-and
-bar
-")
+    test "Don't encode linebreaks" do
+        assert(ExCoder.encode("\nfoo\nand\nbar\n") == "\nfoo\nand\nbar\n")
     end
 
     test "Encode scandinavian characters" do