Rewrite ExCoder for 0.15, use tail call recursion

This commit is contained in:
Mikko Ahlroth 2014-08-06 20:00:21 +03:00
parent 8d2dc493ac
commit 99307a6b3e
4 changed files with 67 additions and 45 deletions

View file

@ -4,4 +4,4 @@ about the mess of parsing HTML) and I did not want to use a full XML library
from the Erlang side.
ExCoder supports decoding both named and numbered HTML entities. Encoding will
always use the hexadecimal numbered format.
use named entities where possible and fall back to hexadecimal encoded entities.

View file

@ -4,12 +4,11 @@ defmodule ExCoder do
@max_codepoint 1114111
# Regex for matching numeric entities
@numeric_decode_regex %R/^(?:\d+|x[\da-f]+);/i
@numeric_decode_regex ~R/^(?:\d+|x[\da-f]+);/i
codec_table = [
{ "	", " " },
{"
", "
"},
{ "
", "\n" },
{ "!", "!" },
{ """, "\"" },
{ """, "\"" },
@ -2045,24 +2044,33 @@ defmodule ExCoder do
# DECODING
def decode(str), do: decode(str, "")
# Generate a decoding function for each element in the replacement table above
lc { entity, char } inlist codec_table do
def decode(<< unquote(entity), rest :: binary >>), do: unquote(char) <> decode rest
for {entity, char} <- codec_table do
def decode(<< unquote(entity), rest :: binary >>, out) do
decode rest, out <> unquote(char)
end
end
def decode(""), do: ""
def decode(<< "&#", rest :: binary >>) do
def decode(<< "&#", rest :: binary >>, out) do
if Regex.match? @numeric_decode_regex, rest do
decode rest, :numericmode
decode rest, out, :numericmode
else
"&#" <> decode rest
decode rest, out <> "&#"
end
end
def decode(str), do: String.first(str) <> decode String.slice str, 1, String.length str
# Base case: the whole input has been consumed, so the accumulator is the result.
def decode("", out), do: out

# Just walk through any unrecognized characters, one grapheme at a time.
# The empty string is handled by the clause above, so there is always a
# grapheme to take here. Splitting with String.next_grapheme/1 avoids
# measuring and re-slicing the whole remaining string on every step.
def decode(str, out) do
  {grapheme, rest} = String.next_grapheme(str)
  decode rest, out <> grapheme
end
# Decode a numeric character or return the original entity if the given
# number is out of range
def decode(numeric, :numericmode, original) when is_number numeric do
if numeric <= @max_codepoint do
<< numeric :: utf8 >>
@ -2071,14 +2079,14 @@ defmodule ExCoder do
end
end
def decode(<< "x", rest :: binary >>, :numericmode) do
def decode(<< "x", rest :: binary >>, out, :numericmode) do
[numeric | rest] = strip_numeric rest
decode(binary_to_integer(numeric, 16), :numericmode, "x" <> numeric) <> decode(Enum.join rest, ";")
decode Enum.join(rest, ";"), out <> decode(String.to_integer(numeric, 16), :numericmode, "x" <> numeric)
end
def decode(str, :numericmode) do
def decode(str, out, :numericmode) do
[numeric | rest] = strip_numeric str
decode(binary_to_integer(numeric), :numericmode, numeric) <> decode(Enum.join rest, ";")
decode Enum.join(rest, ";"), out <> decode(String.to_integer(numeric), :numericmode, numeric)
end
@ -2086,18 +2094,38 @@ defmodule ExCoder do
# ENCODING
# Skip newline when encoding
def encode(<< "
", rest :: binary >>), do: "
" <> encode rest
# Remove duplicate elements from replacement table when encoding
# Build the encoding table: keep only the first {entity, char} pair seen for
# each character, in codec_table order, so that every character ends up with
# exactly one replacement. Pairs are prepended and the list is reversed once
# at the end, because appending with ++ inside the reduce would copy the
# accumulator on every step.
{kept_reversed, _seen_chars} = Enum.reduce(codec_table, {[], HashSet.new},
  fn ({entity, char}, {kept, seen_chars}) ->
    if Set.member?(seen_chars, char) do
      {kept, seen_chars}
    else
      {[{entity, char} | kept], Set.put(seen_chars, char)}
    end
  end
)
encode_table = Enum.reverse kept_reversed
# Replace anything on the codec table with the named entity version
lc { entity, char } inlist codec_table do
def encode(<< unquote(char), rest :: binary >>), do: unquote(entity) <> encode rest
for {entity, char} <- encode_table do
  # Newlines are copied through unchanged; every other character in the
  # table is replaced by its entity.
  replacement = if char == "\n", do: char, else: entity

  def encode(<< unquote(char), rest :: binary >>, out) do
    encode rest, out <> unquote(replacement)
  end
end
def encode(<< codepoint :: utf8, rest :: binary >>) do
encode(codepoint) <> encode rest
def encode(<< codepoint :: utf8, rest :: binary >>, out) do
encode rest, out <> encode(codepoint)
end
# Nothing left to encode: the accumulated output is the final result.
def encode("", out), do: out
# Encode a-z as is
@ -2106,18 +2134,16 @@ defmodule ExCoder do
end
def encode(codepoint) when is_number codepoint do
"&#x" <> integer_to_binary(codepoint, 16) <> ";"
"&#x" <> Integer.to_string(codepoint, 16) <> ";"
end
def encode("") do
""
end
def encode(str), do: encode(str, "")
defp strip_numeric(str) do
String.split(str, ";")
String.split str, ";"
end
end

View file

@ -3,8 +3,8 @@ defmodule Excoder.Mixfile do
def project do
[ app: :excoder,
version: "1.1.0",
elixir: "~> 0.11",
version: "1.2.0",
elixir: "~> 0.15",
deps: deps ]
end

View file

@ -23,6 +23,10 @@ defmodule ExcoderTest do
assert(ExCoder.decode("&#453g3;") == "&#453g3;")
end
test "Decode numeric that is too big" do
assert(ExCoder.decode("&#1114112;") == "&#1114112;")
end
test "Decode HTML" do
assert(ExCoder.decode("""
<div class='line' id='LC9'>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span class="ss">address:</span> <span class="s2">&quot;chat.eu.freenode.net&quot;</span><span class="p">,</span></div><div class='line' id='LC10'>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<span class="ss">
@ -45,16 +49,8 @@ defmodule ExcoderTest do
assert(ExCoder.encode("foo and bar") == "foo and bar")
end
test "Encode linebreak" do
assert(ExCoder.encode("
foo
and
bar
") == "
foo
and
bar
")
test "Don't encode linebreaks" do
assert(ExCoder.encode("\nfoo\nand\nbar\n") == "\nfoo\nand\nbar\n")
end
test "Encode scandinavian characters" do