diff --git a/lib/excoder.ex b/lib/excoder.ex index 8b91659..26ae2bb 100644 --- a/lib/excoder.ex +++ b/lib/excoder.ex @@ -3,7 +3,10 @@ defmodule ExCoder do # Change this in case the definition of UTF-8 changes for some reason :) @max_codepoint 1114111 - decode_table = [ + # Regex for matching numeric entities + @numeric_decode_regex %R/^(?:\d+|x[\da-f]+);/i + + codec_table = [ { " ", " " }, {" ", " "}, @@ -2038,15 +2041,19 @@ defmodule ExCoder do { "𝕫", "𝕫" } ] - lc { entity, value } inlist decode_table do - def decode(<< unquote(entity), rest :: binary >>), do: unquote(value) <> decode rest + + + # DECODING + + # Generate a decoding function for each element in the replacement table above + lc { entity, char } inlist codec_table do + def decode(<< unquote(entity), rest :: binary >>), do: unquote(char) <> decode rest end def decode(""), do: "" def decode(<< "&#", rest :: binary >>) do - r = %R/^(?:\d+|x[\da-f]+);/i - if Regex.match? r, rest do + if Regex.match? @numeric_decode_regex, rest do decode rest, :numericmode else "&#" <> decode rest @@ -2077,22 +2084,35 @@ defmodule ExCoder do - def encode(str) when is_binary str do - String.codepoints(str) |> encode + # ENCODING + + # Skip newline when encoding + def encode(<< " +", rest :: binary >>), do: " +" <> encode rest + + # Replace anything on the codec table with the named entity version + lc { entity, char } inlist codec_table do + def encode(<< unquote(char), rest :: binary >>), do: unquote(entity) <> encode rest end - def encode([]) do + def encode(<< codepoint :: utf8, rest :: binary >>) do + encode(codepoint) <> encode rest + end + + # Encode a-z as is + def encode(codepoint) when is_number(codepoint) and codepoint >= 32 and codepoint <= 126 do + << codepoint :: utf8 >> + end + + def encode(codepoint) when is_number codepoint do + "&#x" <> integer_to_binary(codepoint, 16) <> ";" + end + + def encode("") do "" end - def encode([<< codepoint :: utf8 >> | rest]) when codepoint >= 32 and codepoint <= 126 do - << codepoint :: utf8 >> <> encode rest - end - - def encode([<< codepoint :: utf8 >> | rest]) do - "&#x" <> integer_to_binary(codepoint, 16) <> ";" <> encode rest - end - diff --git a/mix.exs b/mix.exs index 87f1875..bbc9b8d 100644 --- a/mix.exs +++ b/mix.exs @@ -3,8 +3,8 @@ defmodule Excoder.Mixfile do def project do [ app: :excoder, - version: "1.0.0", - elixir: "~> 0.10.1-dev", + version: "1.1.0", + elixir: "~> 0.11", deps: deps ] end diff --git a/test/excoder_test.exs b/test/excoder_test.exs new file mode 100644 index 0000000..34c9754 --- /dev/null +++ b/test/excoder_test.exs @@ -0,0 +1,72 @@ +Code.require_file "test_helper.exs", __DIR__ + +defmodule ExcoderTest do + use ExUnit.Case + + test "Decode single entity" do + assert(ExCoder.decode("&") == "&") + end + + test "Decode numeric entity" do + assert(ExCoder.decode("ᄀ") == "ᄀ") + end + + test "Decode hexadecimal entity" do + assert(ExCoder.decode("ģ") == "ģ") + end + + test "Decode invalid hexadecimal" do + assert(ExCoder.decode("h23;") == "h23;") + end + + test "Decode invalid numeric" do + assert(ExCoder.decode("Džg3;") == "Džg3;") + end + + test "Decode HTML" do + assert(ExCoder.decode(""" +
      address: "chat.eu.freenode.net",
       +""" +) == """ +
      address: "chat.eu.freenode.net",
       +""" + ) + end + + test "Decode invalid case" do + assert(ExCoder.decode("&AOPF;") == "&AOPF;") + end + + + + + + test "Encode simple text" do + assert(ExCoder.encode("foo and bar") == "foo and bar") + end + + test "Encode linebreak" do + assert(ExCoder.encode(" +foo +and +bar +") == " +foo +and +bar +") + end + + test "Encode scandinavian characters" do + assert(ExCoder.encode("Hääyöaie liittyy öylättiin, sanoi Åke.") == "Hääyöaie liittyy öylättiin, sanoi Åke.") + end + + test "Encode apple signs" do + assert(ExCoder.encode("⌘⌥⇧ and the last one is ⎋") == "⌘⌥⇧ and the last one is ⎋") + end + + test "Encode decode roundtrip with very difficult unicode (HE COMES)" do + zalgo = "Z͉̘͕͐ḁ̗̕ͅl̟̥̳̞̞͔ͬ̔̂̋̾gͮͫ̓̓̕o͎͉̹͕͌͂̍͐̌̋ ͕̖͖ͯĩ͇̹̬̤͕̟ͭ̇ș̼̠̹̒̂ͭͯ̓ ͉̪͍͚̗̟͚ḟ̴̙̟̯͚̭̳̼̈́û̔ͥ̒nͥͩͭ̎̃!̫͔ ̲̯̰̰̗͔ͪ͝R̭͉̬͓̜͉ͮͬͤ̃ͯ͊a̡̩̍͊i̖̺̝̻ͦ̋̾͑͌̇ņ͍̖̟͚͓ͧ͒b̻̹͉͉̘̎̄ͬ̆͗͠ơ̮̱͉͓̗̝̯ẅ͍̱́̋ͧͭ̾̐̇s̯͓̦͓̦͇͚̎ͦ̌͐̾ ̟̞̑̉̈̎̈́l̯͉̯ͭ͡ȏ̟͙̯̞̫̳̮͒ͭ̎͆l̼͒ͩ̌̀͒l̒͒ͭ̌ͩ̎͘ͅi̝̲ͬ̋̾̉̋p̢̣͉͒̾͗ͨͯö͙̳̘̜̓ͣ͒́̿ͫͅp̵͔͎͓sͅ.̶̹͙̼̺ͧ͂̒" + assert(ExCoder.decode(ExCoder.encode(zalgo)) == zalgo) + end +end diff --git a/test/test_helper.exs b/test/test_helper.exs new file mode 100644 index 0000000..4b8b246 --- /dev/null +++ b/test/test_helper.exs @@ -0,0 +1 @@ +ExUnit.start