Rewrite ExCoder for 0.15, use tail call recursion
This commit is contained in:
parent
8d2dc493ac
commit
99307a6b3e
4 changed files with 67 additions and 45 deletions
|
@ -4,4 +4,4 @@ about the mess of parsing HTML) and I did not want to use a full XML library
|
|||
from the Erlang side.
|
||||
|
||||
ExCoder supports decoding both named and numbered HTML entities. Encoding will
|
||||
always use the hexadecimal numbered format.
|
||||
use named entities where possible and fall back to hexadecimal encoded entities.
|
||||
|
|
|
@ -4,12 +4,11 @@ defmodule ExCoder do
|
|||
@max_codepoint 1114111
|
||||
|
||||
# Regex for matching numeric entities
|
||||
@numeric_decode_regex %R/^(?:\d+|x[\da-f]+);/i
|
||||
@numeric_decode_regex ~R/^(?:\d+|x[\da-f]+);/i
|
||||
|
||||
codec_table = [
|
||||
{ "&Tab;", " " },
|
||||
{"&NewLine;", "
|
||||
"},
|
||||
{ "&NewLine;", "\n" },
|
||||
{ "&excl;", "!" },
|
||||
{ "&quot;", "\"" },
|
||||
{ "&QUOT;", "\"" },
|
||||
|
@ -2045,24 +2044,33 @@ defmodule ExCoder do
|
|||
|
||||
# DECODING
|
||||
|
||||
def decode(str), do: decode(str, "")
|
||||
|
||||
# Generate a decoding function for each element in the replacement table above
|
||||
lc { entity, char } inlist codec_table do
|
||||
def decode(<< unquote(entity), rest :: binary >>), do: unquote(char) <> decode rest
|
||||
end
|
||||
|
||||
def decode(""), do: ""
|
||||
|
||||
def decode(<< "&#", rest :: binary >>) do
|
||||
if Regex.match? @numeric_decode_regex, rest do
|
||||
decode rest, :numericmode
|
||||
else
|
||||
"&#" <> decode rest
|
||||
for {entity, char} <- codec_table do
|
||||
def decode(<< unquote(entity), rest :: binary >>, out) do
|
||||
decode rest, out <> unquote(char)
|
||||
end
|
||||
end
|
||||
|
||||
def decode(str), do: String.first(str) <> decode String.slice str, 1, String.length str
|
||||
def decode(<< "&#", rest :: binary >>, out) do
|
||||
if Regex.match? @numeric_decode_regex, rest do
|
||||
decode rest, out, :numericmode
|
||||
else
|
||||
decode rest, out <> "&#"
|
||||
end
|
||||
end
|
||||
|
||||
def decode("", out), do: out
|
||||
|
||||
# Just walk through any unrecognized characters
|
||||
def decode(str, out) do
|
||||
decode String.slice(str, 1, String.length str), out <> String.first(str)
|
||||
end
|
||||
|
||||
|
||||
# Decode a numeric character or return the original entity if the given
|
||||
# number is out of range
|
||||
def decode(numeric, :numericmode, original) when is_number numeric do
|
||||
if numeric <= @max_codepoint do
|
||||
<< numeric :: utf8 >>
|
||||
|
@ -2071,14 +2079,14 @@ defmodule ExCoder do
|
|||
end
|
||||
end
|
||||
|
||||
def decode(<< "x", rest :: binary >>, :numericmode) do
|
||||
def decode(<< "x", rest :: binary >>, out, :numericmode) do
|
||||
[numeric | rest] = strip_numeric rest
|
||||
decode(binary_to_integer(numeric, 16), :numericmode, "x" <> numeric) <> decode(Enum.join rest, ";")
|
||||
decode Enum.join(rest, ";"), out <> decode(String.to_integer(numeric, 16), :numericmode, "x" <> numeric)
|
||||
end
|
||||
|
||||
def decode(str, :numericmode) do
|
||||
def decode(str, out, :numericmode) do
|
||||
[numeric | rest] = strip_numeric str
|
||||
decode(binary_to_integer(numeric), :numericmode, numeric) <> decode(Enum.join rest, ";")
|
||||
decode Enum.join(rest, ";"), out <> decode(String.to_integer(numeric), :numericmode, numeric)
|
||||
end
|
||||
|
||||
|
||||
|
@ -2086,18 +2094,38 @@ defmodule ExCoder do
|
|||
|
||||
# ENCODING
|
||||
|
||||
# Skip newline when encoding
|
||||
def encode(<< "
|
||||
", rest :: binary >>), do: "
|
||||
" <> encode rest
|
||||
# Remove duplicate elements from replacement table when encoding
|
||||
{encode_table, _} = Enum.reduce(codec_table, {[], HashSet.new},
|
||||
fn ({entity, char}, {acc, passed_chars}) ->
|
||||
unless Set.member?(passed_chars, char) do
|
||||
passed_chars = Set.put passed_chars, char
|
||||
{acc ++ [{entity, char}], passed_chars}
|
||||
else
|
||||
{acc, passed_chars}
|
||||
end
|
||||
end
|
||||
)
|
||||
|
||||
# Replace anything on the codec table with the named entity version
|
||||
lc { entity, char } inlist codec_table do
|
||||
def encode(<< unquote(char), rest :: binary >>), do: unquote(entity) <> encode rest
|
||||
for {entity, char} <- encode_table do
|
||||
# Except skip newlines
|
||||
if char != "\n" do
|
||||
def encode(<< unquote(char), rest :: binary >>, out) do
|
||||
encode rest, out <> unquote(entity)
|
||||
end
|
||||
else
|
||||
def encode(<< "\n", rest :: binary >>, out) do
|
||||
encode rest, out <> "\n"
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def encode(<< codepoint :: utf8, rest :: binary >>) do
|
||||
encode(codepoint) <> encode rest
|
||||
def encode(<< codepoint :: utf8, rest :: binary >>, out) do
|
||||
encode rest, out <> encode(codepoint)
|
||||
end
|
||||
|
||||
def encode("", out) do
|
||||
out
|
||||
end
|
||||
|
||||
# Encode a-z as is
|
||||
|
@ -2106,18 +2134,16 @@ defmodule ExCoder do
|
|||
end
|
||||
|
||||
def encode(codepoint) when is_number codepoint do
|
||||
"&#x" <> integer_to_binary(codepoint, 16) <> ";"
|
||||
"&#x" <> Integer.to_string(codepoint, 16) <> ";"
|
||||
end
|
||||
|
||||
def encode("") do
|
||||
""
|
||||
end
|
||||
def encode(str), do: encode(str, "")
|
||||
|
||||
|
||||
|
||||
|
||||
defp strip_numeric(str) do
|
||||
String.split(str, ";")
|
||||
String.split str, ";"
|
||||
end
|
||||
|
||||
end
|
||||
|
|
4
mix.exs
4
mix.exs
|
@ -3,8 +3,8 @@ defmodule Excoder.Mixfile do
|
|||
|
||||
def project do
|
||||
[ app: :excoder,
|
||||
version: "1.1.0",
|
||||
elixir: "~> 0.11",
|
||||
version: "1.2.0",
|
||||
elixir: "~> 0.15",
|
||||
deps: deps ]
|
||||
end
|
||||
|
||||
|
|
|
@ -23,6 +23,10 @@ defmodule ExcoderTest do
|
|||
assert(ExCoder.decode("Džg3;") == "Džg3;")
|
||||
end
|
||||
|
||||
test "Decode numeric that is too big" do
|
||||
assert(ExCoder.decode("�") == "�")
|
||||
end
|
||||
|
||||
test "Decode HTML" do
|
||||
assert(ExCoder.decode("""
|
||||
<div class='line' id='LC9'> <span class="ss">address:</span> <span class="s2">"chat.eu.freenode.net"</span><span class="p">,</span></div><div class='line' id='LC10'> <span class="ss">
|
||||
|
@ -45,16 +49,8 @@ defmodule ExcoderTest do
|
|||
assert(ExCoder.encode("foo and bar") == "foo and bar")
|
||||
end
|
||||
|
||||
test "Encode linebreak" do
|
||||
assert(ExCoder.encode("
|
||||
foo
|
||||
and
|
||||
bar
|
||||
") == "
|
||||
foo
|
||||
and
|
||||
bar
|
||||
")
|
||||
test "Don't encode linebreaks" do
|
||||
assert(ExCoder.encode("\nfoo\nand\nbar\n") == "\nfoo\nand\nbar\n")
|
||||
end
|
||||
|
||||
test "Encode scandinavian characters" do
|
||||
|
|
Loading…
Reference in a new issue