Rewrite ExCoder for Elixir 0.15, use tail-call recursion

This commit is contained in:
Mikko Ahlroth 2014-08-06 20:00:21 +03:00
parent 8d2dc493ac
commit 99307a6b3e
4 changed files with 67 additions and 45 deletions

View file

@ -4,4 +4,4 @@ about the mess of parsing HTML) and I did not want to use a full XML library
from the Erlang side. from the Erlang side.
ExCoder supports both decoding named and numbered HTML entities. Encoding will ExCoder supports both decoding named and numbered HTML entities. Encoding will
always use the hexadecimal numbered format. use named entities where possible and fall back to hexadecimal encoded entities.

View file

@ -4,12 +4,11 @@ defmodule ExCoder do
@max_codepoint 1114111 @max_codepoint 1114111
# Regex for matching numeric entities # Regex for matching numeric entities
@numeric_decode_regex %R/^(?:\d+|x[\da-f]+);/i @numeric_decode_regex ~R/^(?:\d+|x[\da-f]+);/i
codec_table = [ codec_table = [
{ "	", " " }, { "	", " " },
{"
", " { "
", "\n" },
"},
{ "!", "!" }, { "!", "!" },
{ """, "\"" }, { """, "\"" },
{ """, "\"" }, { """, "\"" },
@ -2045,24 +2044,33 @@ defmodule ExCoder do
# DECODING # DECODING
def decode(str), do: decode(str, "")
# Generate a decoding function for each element in the replacement table above # Generate a decoding function for each element in the replacement table above
lc { entity, char } inlist codec_table do for {entity, char} <- codec_table do
def decode(<< unquote(entity), rest :: binary >>), do: unquote(char) <> decode rest def decode(<< unquote(entity), rest :: binary >>, out) do
end decode rest, out <> unquote(char)
def decode(""), do: ""
def decode(<< "&#", rest :: binary >>) do
if Regex.match? @numeric_decode_regex, rest do
decode rest, :numericmode
else
"&#" <> decode rest
end end
end end
def decode(str), do: String.first(str) <> decode String.slice str, 1, String.length str def decode(<< "&#", rest :: binary >>, out) do
if Regex.match? @numeric_decode_regex, rest do
decode rest, out, :numericmode
else
decode rest, out <> "&#"
end
end
def decode("", out), do: out
# Just walk through any unrecognized characters
def decode(str, out) do
decode String.slice(str, 1, String.length str), out <> String.first(str)
end
# Decode a numeric character or return the original entity if the given
# number is out of range
def decode(numeric, :numericmode, original) when is_number numeric do def decode(numeric, :numericmode, original) when is_number numeric do
if numeric <= @max_codepoint do if numeric <= @max_codepoint do
<< numeric :: utf8 >> << numeric :: utf8 >>
@ -2071,14 +2079,14 @@ defmodule ExCoder do
end end
end end
def decode(<< "x", rest :: binary >>, :numericmode) do def decode(<< "x", rest :: binary >>, out, :numericmode) do
[numeric | rest] = strip_numeric rest [numeric | rest] = strip_numeric rest
decode(binary_to_integer(numeric, 16), :numericmode, "x" <> numeric) <> decode(Enum.join rest, ";") decode Enum.join(rest, ";"), out <> decode(String.to_integer(numeric, 16), :numericmode, "x" <> numeric)
end end
def decode(str, :numericmode) do def decode(str, out, :numericmode) do
[numeric | rest] = strip_numeric str [numeric | rest] = strip_numeric str
decode(binary_to_integer(numeric), :numericmode, numeric) <> decode(Enum.join rest, ";") decode Enum.join(rest, ";"), out <> decode(String.to_integer(numeric), :numericmode, numeric)
end end
@ -2086,18 +2094,38 @@ defmodule ExCoder do
# ENCODING # ENCODING
# Skip newline when encoding # Remove duplicate elements from replacement table when encoding
def encode(<< " {encode_table, _} = Enum.reduce(codec_table, {[], HashSet.new},
", rest :: binary >>), do: " fn ({entity, char}, {acc, passed_chars}) ->
" <> encode rest unless Set.member?(passed_chars, char) do
passed_chars = Set.put passed_chars, char
{acc ++ [{entity, char}], passed_chars}
else
{acc, passed_chars}
end
end
)
# Replace anything on the codec table with the named entity version # Replace anything on the codec table with the named entity version
lc { entity, char } inlist codec_table do for {entity, char} <- encode_table do
def encode(<< unquote(char), rest :: binary >>), do: unquote(entity) <> encode rest # Except skip newlines
if char != "\n" do
def encode(<< unquote(char), rest :: binary >>, out) do
encode rest, out <> unquote(entity)
end
else
def encode(<< "\n", rest :: binary >>, out) do
encode rest, out <> "\n"
end
end
end end
def encode(<< codepoint :: utf8, rest :: binary >>) do def encode(<< codepoint :: utf8, rest :: binary >>, out) do
encode(codepoint) <> encode rest encode rest, out <> encode(codepoint)
end
def encode("", out) do
out
end end
# Encode a-z as is # Encode a-z as is
@ -2106,18 +2134,16 @@ defmodule ExCoder do
end end
def encode(codepoint) when is_number codepoint do def encode(codepoint) when is_number codepoint do
"&#x" <> integer_to_binary(codepoint, 16) <> ";" "&#x" <> Integer.to_string(codepoint, 16) <> ";"
end end
def encode("") do def encode(str), do: encode(str, "")
""
end
defp strip_numeric(str) do defp strip_numeric(str) do
String.split(str, ";") String.split str, ";"
end end
end end

View file

@ -3,8 +3,8 @@ defmodule Excoder.Mixfile do
def project do def project do
[ app: :excoder, [ app: :excoder,
version: "1.1.0", version: "1.2.0",
elixir: "~> 0.11", elixir: "~> 0.15",
deps: deps ] deps: deps ]
end end

View file

@ -23,6 +23,10 @@ defmodule ExcoderTest do
assert(ExCoder.decode("&#453g3;") == "&#453g3;") assert(ExCoder.decode("&#453g3;") == "&#453g3;")
end end
test "Decode numeric that is too big" do
assert(ExCoder.decode("&#1114112;") == "&#1114112;")
end
test "Decode HTML" do test "Decode HTML" do
assert(ExCoder.decode(""" assert(ExCoder.decode("""
<div class='line' id='LC9'>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span class="ss">address:</span> <span class="s2">&quot;chat.eu.freenode.net&quot;</span><span class="p">,</span></div><div class='line' id='LC10'>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span class="ss"> <div class='line' id='LC9'>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span class="ss">address:</span> <span class="s2">&quot;chat.eu.freenode.net&quot;</span><span class="p">,</span></div><div class='line' id='LC10'>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span class="ss">
@ -45,16 +49,8 @@ defmodule ExcoderTest do
assert(ExCoder.encode("foo and bar") == "foo and bar") assert(ExCoder.encode("foo and bar") == "foo and bar")
end end
test "Encode linebreak" do test "Don't encode linebreaks" do
assert(ExCoder.encode(" assert(ExCoder.encode("\nfoo\nand\nbar\n") == "\nfoo\nand\nbar\n")
foo
and
bar
") == "
foo
and
bar
")
end end
test "Encode scandinavian characters" do test "Encode scandinavian characters" do