From 99307a6b3e09f23443518191ec2f9a022bc51ca3 Mon Sep 17 00:00:00 2001 From: Mikko Ahlroth Date: Wed, 6 Aug 2014 20:00:21 +0300 Subject: [PATCH] =?UTF-8?q?Rewrite=20ExCoder=20for=200.15=EF=9C=88,=20use?= =?UTF-8?q?=20tail=20call=20recursion?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 2 +- lib/excoder.ex | 90 ++++++++++++++++++++++++++++--------------- mix.exs | 4 +- test/excoder_test.exs | 16 +++----- 4 files changed, 67 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index 663402d..db0c75b 100644 --- a/README.md +++ b/README.md @@ -4,4 +4,4 @@ about the mess of parsing HTML) and I did not want to use a full XML library from the Erlang side. ExCoder supports both decoding named and numbered HTML entities. Encoding will -always use the hexadecimal numbered format. +use named entities where possible and fall back to hexadecimal encoded entities. diff --git a/lib/excoder.ex b/lib/excoder.ex index 26ae2bb..68b82cd 100644 --- a/lib/excoder.ex +++ b/lib/excoder.ex @@ -4,12 +4,11 @@ defmodule ExCoder do @max_codepoint 1114111 # Regex for matching numeric entities - @numeric_decode_regex %R/^(?:\d+|x[\da-f]+);/i + @numeric_decode_regex ~R/^(?:\d+|x[\da-f]+);/i codec_table = [ { " ", " " }, - {" ", " -"}, + { " ", "\n" }, { "!", "!" }, { """, "\"" }, { """, "\"" }, @@ -2045,24 +2044,33 @@ defmodule ExCoder do # DECODING + def decode(str), do: decode(str, "") + # Generate a decoding function for each element in the replacement table above - lc { entity, char } inlist codec_table do - def decode(<< unquote(entity), rest :: binary >>), do: unquote(char) <> decode rest - end - - def decode(""), do: "" - - def decode(<< "&#", rest :: binary >>) do - if Regex.match? @numeric_decode_regex, rest do - decode rest, :numericmode - else - "&#" <> decode rest + for {entity, char} <- codec_table do + def decode(<< unquote(entity), rest :: binary >>, out) do + decode rest, out <> unquote(char) end end - def decode(str), do: String.first(str) <> decode String.slice str, 1, String.length str + def decode(<< "&#", rest :: binary >>, out) do + if Regex.match? @numeric_decode_regex, rest do + decode rest, out, :numericmode + else + decode rest, out <> "&#" + end + end + + def decode("", out), do: out + + # Just walk through any unrecognized characters + def decode(str, out) do + decode String.slice(str, 1, String.length str), out <> String.first(str) + end + # Decode a numeric character or return the original entity if the given + # number is out of range def decode(numeric, :numericmode, original) when is_number numeric do if numeric <= @max_codepoint do << numeric :: utf8 >> @@ -2071,14 +2079,14 @@ defmodule ExCoder do end end - def decode(<< "x", rest :: binary >>, :numericmode) do + def decode(<< "x", rest :: binary >>, out, :numericmode) do [numeric | rest] = strip_numeric rest - decode(binary_to_integer(numeric, 16), :numericmode, "x" <> numeric) <> decode(Enum.join rest, ";") + decode Enum.join(rest, ";"), out <> decode(String.to_integer(numeric, 16), :numericmode, "x" <> numeric) end - def decode(str, :numericmode) do + def decode(str, out, :numericmode) do [numeric | rest] = strip_numeric str - decode(binary_to_integer(numeric), :numericmode, numeric) <> decode(Enum.join rest, ";") + decode Enum.join(rest, ";"), out <> decode(String.to_integer(numeric), :numericmode, numeric) end @@ -2086,18 +2094,38 @@ defmodule ExCoder do # ENCODING - # Skip newline when encoding - def encode(<< " -", rest :: binary >>), do: " -" <> encode rest + # Remove duplicate elements from replacement table when encoding + {encode_table, _} = Enum.reduce(codec_table, {[], HashSet.new}, + fn ({entity, char}, {acc, passed_chars}) -> + unless Set.member?(passed_chars, char) do + passed_chars = Set.put passed_chars, char + {acc ++ [{entity, char}], passed_chars} + else + {acc, passed_chars} + end + end + ) # Replace anything on the codec table with the named entity version - lc { entity, char } inlist codec_table do - def encode(<< unquote(char), rest :: binary >>), do: unquote(entity) <> encode rest + for {entity, char} <- encode_table do + # Except skip newlines + if char != "\n" do + def encode(<< unquote(char), rest :: binary >>, out) do + encode rest, out <> unquote(entity) + end + else + def encode(<< "\n", rest :: binary >>, out) do + encode rest, out <> "\n" + end + end end - def encode(<< codepoint :: utf8, rest :: binary >>) do - encode(codepoint) <> encode rest + def encode(<< codepoint :: utf8, rest :: binary >>, out) do + encode rest, out <> encode(codepoint) + end + + def encode("", out) do + out end # Encode a-z as is @@ -2106,18 +2134,16 @@ defmodule ExCoder do end def encode(codepoint) when is_number codepoint do - "&#x" <> integer_to_binary(codepoint, 16) <> ";" + "&#x" <> Integer.to_string(codepoint, 16) <> ";" end - def encode("") do - "" - end + def encode(str), do: encode(str, "") defp strip_numeric(str) do - String.split(str, ";") + String.split str, ";" end end diff --git a/mix.exs b/mix.exs index bbc9b8d..e6da5a7 100644 --- a/mix.exs +++ b/mix.exs @@ -3,8 +3,8 @@ defmodule Excoder.Mixfile do def project do [ app: :excoder, - version: "1.1.0", - elixir: "~> 0.11", + version: "1.2.0", + elixir: "~> 0.15", deps: deps ] end diff --git a/test/excoder_test.exs b/test/excoder_test.exs index 34c9754..e21303f 100644 --- a/test/excoder_test.exs +++ b/test/excoder_test.exs @@ -23,6 +23,10 @@ defmodule ExcoderTest do assert(ExCoder.decode("Džg3;") == "Džg3;") end + test "Decode numeric that is too big" do + assert(ExCoder.decode("�") == "�") + end + test "Decode HTML" do assert(ExCoder.decode("""
      address: "chat.eu.freenode.net",
       @@ -45,16 +49,8 @@ defmodule ExcoderTest do assert(ExCoder.encode("foo and bar") == "foo and bar") end - test "Encode linebreak" do - assert(ExCoder.encode(" -foo -and -bar -") == " -foo -and -bar -") + test "Don't encode linebreaks" do + assert(ExCoder.encode("\nfoo\nand\nbar\n") == "\nfoo\nand\nbar\n") end test "Encode scandinavian characters" do