Rewrite ExCoder for 0.15, use tail call recursion
This commit is contained in:
parent
8d2dc493ac
commit
99307a6b3e
4 changed files with 67 additions and 45 deletions
|
@ -4,4 +4,4 @@ about the mess of parsing HTML) and I did not want to use a full XML library
|
||||||
from the Erlang side.
|
from the Erlang side.
|
||||||
|
|
||||||
ExCoder supports decoding both named and numbered HTML entities. Encoding will
|
ExCoder supports decoding both named and numbered HTML entities. Encoding will
|
||||||
always use the hexadecimal numbered format.
|
use named entities where possible and fall back to hexadecimal encoded entities.
|
||||||
|
|
|
@ -4,12 +4,11 @@ defmodule ExCoder do
|
||||||
@max_codepoint 1114111
|
@max_codepoint 1114111
|
||||||
|
|
||||||
# Regex for matching numeric entities
|
# Regex for matching numeric entities
|
||||||
@numeric_decode_regex %R/^(?:\d+|x[\da-f]+);/i
|
@numeric_decode_regex ~R/^(?:\d+|x[\da-f]+);/i
|
||||||
|
|
||||||
codec_table = [
|
codec_table = [
|
||||||
{ "	", " " },
|
{ "	", " " },
|
||||||
{"&NewLine;", "
|
{ "&NewLine;", "\n" },
|
||||||
"},
|
|
||||||
{ "&excl;", "!" },
|
{ "&excl;", "!" },
|
||||||
{ "&quot;", "\"" },
|
{ "&quot;", "\"" },
|
||||||
{ "&QUOT;", "\"" },
|
{ "&QUOT;", "\"" },
|
||||||
|
@ -2045,24 +2044,33 @@ defmodule ExCoder do
|
||||||
|
|
||||||
# DECODING
|
# DECODING
|
||||||
|
|
||||||
|
def decode(str), do: decode(str, "")
|
||||||
|
|
||||||
# Generate a decoding function for each element in the replacement table above
|
# Generate a decoding function for each element in the replacement table above
|
||||||
lc { entity, char } inlist codec_table do
|
for {entity, char} <- codec_table do
|
||||||
def decode(<< unquote(entity), rest :: binary >>), do: unquote(char) <> decode rest
|
def decode(<< unquote(entity), rest :: binary >>, out) do
|
||||||
|
decode rest, out <> unquote(char)
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def decode(""), do: ""
|
def decode(<< "&#", rest :: binary >>, out) do
|
||||||
|
|
||||||
def decode(<< "&#", rest :: binary >>) do
|
|
||||||
if Regex.match? @numeric_decode_regex, rest do
|
if Regex.match? @numeric_decode_regex, rest do
|
||||||
decode rest, :numericmode
|
decode rest, out, :numericmode
|
||||||
else
|
else
|
||||||
"&#" <> decode rest
|
decode rest, out <> "&#"
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def decode(str), do: String.first(str) <> decode String.slice str, 1, String.length str
|
def decode("", out), do: out
|
||||||
|
|
||||||
|
# Just walk through any unrecognized characters
|
||||||
|
def decode(str, out) do
|
||||||
|
decode String.slice(str, 1, String.length str), out <> String.first(str)
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
|
# Decode a numeric character or return the original entity if the given
|
||||||
|
# number is out of range
|
||||||
def decode(numeric, :numericmode, original) when is_number numeric do
|
def decode(numeric, :numericmode, original) when is_number numeric do
|
||||||
if numeric <= @max_codepoint do
|
if numeric <= @max_codepoint do
|
||||||
<< numeric :: utf8 >>
|
<< numeric :: utf8 >>
|
||||||
|
@ -2071,14 +2079,14 @@ defmodule ExCoder do
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def decode(<< "x", rest :: binary >>, :numericmode) do
|
def decode(<< "x", rest :: binary >>, out, :numericmode) do
|
||||||
[numeric | rest] = strip_numeric rest
|
[numeric | rest] = strip_numeric rest
|
||||||
decode(binary_to_integer(numeric, 16), :numericmode, "x" <> numeric) <> decode(Enum.join rest, ";")
|
decode Enum.join(rest, ";"), out <> decode(String.to_integer(numeric, 16), :numericmode, "x" <> numeric)
|
||||||
end
|
end
|
||||||
|
|
||||||
def decode(str, :numericmode) do
|
def decode(str, out, :numericmode) do
|
||||||
[numeric | rest] = strip_numeric str
|
[numeric | rest] = strip_numeric str
|
||||||
decode(binary_to_integer(numeric), :numericmode, numeric) <> decode(Enum.join rest, ";")
|
decode Enum.join(rest, ";"), out <> decode(String.to_integer(numeric), :numericmode, numeric)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
@ -2086,18 +2094,38 @@ defmodule ExCoder do
|
||||||
|
|
||||||
# ENCODING
|
# ENCODING
|
||||||
|
|
||||||
# Skip newline when encoding
|
# Remove duplicate elements from replacement table when encoding
|
||||||
def encode(<< "
|
{encode_table, _} = Enum.reduce(codec_table, {[], HashSet.new},
|
||||||
", rest :: binary >>), do: "
|
fn ({entity, char}, {acc, passed_chars}) ->
|
||||||
" <> encode rest
|
unless Set.member?(passed_chars, char) do
|
||||||
|
passed_chars = Set.put passed_chars, char
|
||||||
|
{acc ++ [{entity, char}], passed_chars}
|
||||||
|
else
|
||||||
|
{acc, passed_chars}
|
||||||
|
end
|
||||||
|
end
|
||||||
|
)
|
||||||
|
|
||||||
# Replace anything on the codec table with the named entity version
|
# Replace anything on the codec table with the named entity version
|
||||||
lc { entity, char } inlist codec_table do
|
for {entity, char} <- encode_table do
|
||||||
def encode(<< unquote(char), rest :: binary >>), do: unquote(entity) <> encode rest
|
# Except skip newlines
|
||||||
|
if char != "\n" do
|
||||||
|
def encode(<< unquote(char), rest :: binary >>, out) do
|
||||||
|
encode rest, out <> unquote(entity)
|
||||||
|
end
|
||||||
|
else
|
||||||
|
def encode(<< "\n", rest :: binary >>, out) do
|
||||||
|
encode rest, out <> "\n"
|
||||||
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def encode(<< codepoint :: utf8, rest :: binary >>) do
|
def encode(<< codepoint :: utf8, rest :: binary >>, out) do
|
||||||
encode(codepoint) <> encode rest
|
encode rest, out <> encode(codepoint)
|
||||||
|
end
|
||||||
|
|
||||||
|
def encode("", out) do
|
||||||
|
out
|
||||||
end
|
end
|
||||||
|
|
||||||
# Encode a-z as is
|
# Encode a-z as is
|
||||||
|
@ -2106,18 +2134,16 @@ defmodule ExCoder do
|
||||||
end
|
end
|
||||||
|
|
||||||
def encode(codepoint) when is_number codepoint do
|
def encode(codepoint) when is_number codepoint do
|
||||||
"&#x" <> integer_to_binary(codepoint, 16) <> ";"
|
"&#x" <> Integer.to_string(codepoint, 16) <> ";"
|
||||||
end
|
end
|
||||||
|
|
||||||
def encode("") do
|
def encode(str), do: encode(str, "")
|
||||||
""
|
|
||||||
end
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
defp strip_numeric(str) do
|
defp strip_numeric(str) do
|
||||||
String.split(str, ";")
|
String.split str, ";"
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
4
mix.exs
4
mix.exs
|
@ -3,8 +3,8 @@ defmodule Excoder.Mixfile do
|
||||||
|
|
||||||
def project do
|
def project do
|
||||||
[ app: :excoder,
|
[ app: :excoder,
|
||||||
version: "1.1.0",
|
version: "1.2.0",
|
||||||
elixir: "~> 0.11",
|
elixir: "~> 0.15",
|
||||||
deps: deps ]
|
deps: deps ]
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -23,6 +23,10 @@ defmodule ExcoderTest do
|
||||||
assert(ExCoder.decode("Džg3;") == "Džg3;")
|
assert(ExCoder.decode("Džg3;") == "Džg3;")
|
||||||
end
|
end
|
||||||
|
|
||||||
|
test "Decode numeric that is too big" do
|
||||||
|
assert(ExCoder.decode("�") == "�")
|
||||||
|
end
|
||||||
|
|
||||||
test "Decode HTML" do
|
test "Decode HTML" do
|
||||||
assert(ExCoder.decode("""
|
assert(ExCoder.decode("""
|
||||||
<div class='line' id='LC9'> <span class="ss">address:</span> <span class="s2">"chat.eu.freenode.net"</span><span class="p">,</span></div><div class='line' id='LC10'> <span class="ss">
|
<div class='line' id='LC9'> <span class="ss">address:</span> <span class="s2">"chat.eu.freenode.net"</span><span class="p">,</span></div><div class='line' id='LC10'> <span class="ss">
|
||||||
|
@ -45,16 +49,8 @@ defmodule ExcoderTest do
|
||||||
assert(ExCoder.encode("foo and bar") == "foo and bar")
|
assert(ExCoder.encode("foo and bar") == "foo and bar")
|
||||||
end
|
end
|
||||||
|
|
||||||
test "Encode linebreak" do
|
test "Don't encode linebreaks" do
|
||||||
assert(ExCoder.encode("
|
assert(ExCoder.encode("\nfoo\nand\nbar\n") == "\nfoo\nand\nbar\n")
|
||||||
foo
|
|
||||||
and
|
|
||||||
bar
|
|
||||||
") == "
|
|
||||||
foo
|
|
||||||
and
|
|
||||||
bar
|
|
||||||
")
|
|
||||||
end
|
end
|
||||||
|
|
||||||
test "Encode scandinavian characters" do
|
test "Encode scandinavian characters" do
|
||||||
|
|
Loading…
Reference in a new issue