From b548fdd1bb00dc07443222a38332f5869e627026 Mon Sep 17 00:00:00 2001
From: Mikko Ahlroth
Date: Wed, 2 Oct 2013 11:33:51 +0300
Subject: [PATCH] Fix stuff deprecated in Elixir 0.10.2, initial IPv6 and
 gzip support for url analyzer

---
 lib/connection.ex          |  6 ++--
 lib/plugins/urlanalyzer.ex | 68 +++++++++++++++++++++++++-------------
 mix.exs                    |  4 +--
 3 files changed, 50 insertions(+), 28 deletions(-)

diff --git a/lib/connection.ex b/lib/connection.ex
index 6391d06..3ba96b0 100644
--- a/lib/connection.ex
+++ b/lib/connection.ex
@@ -36,7 +36,7 @@
   end
 
   def handle_cast(:connect, data) do
-    {:ok, sock} = connect binary_to_list(data.host), data.port
+    {:ok, sock} = connect String.to_char_list!(data.host), data.port
     data = data.sock sock
 
     send_connect_info data
@@ -88,7 +88,7 @@
     case String.split stripped do
       [_, "433" | _] ->
         send data.buffer, "NICK " <> data.altnick
-        uniqid = to_binary :random.uniform(9999)
+        uniqid = to_string :random.uniform(9999)
         data = data.altnick String.slice(data.altnick, 0, 10) <> "-" <> uniqid
       _ ->
     end
@@ -120,7 +120,7 @@
 
   defp send_raw(data, msg) do
     :ok = :gen_tcp.send data.sock, String.slice(msg, 0, @max_len) <> "\r\n"
-    IO.puts(to_binary(data.id) <> " <- " <> String.slice(msg, 0, @max_len))
+    IO.puts(to_string(data.id) <> " <- " <> String.slice(msg, 0, @max_len))
   end
 
   defp tell_handler(data, msg) do
diff --git a/lib/plugins/urlanalyzer.ex b/lib/plugins/urlanalyzer.ex
index 108ec85..4c03de8 100644
--- a/lib/plugins/urlanalyzer.ex
+++ b/lib/plugins/urlanalyzer.ex
@@ -17,13 +17,16 @@ defmodule Nulform.Plugins.URLAnalyzer do
   @base_options [body_format: :binary, sync: true]
   @http_options [timeout: @timeout, autoredirect: false]
   # We need a real user agent since some sites fail on nonstandard ones
-  @headers [{'user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36'}]
+  # Also request sites as non-compressed
+  @headers [{'user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36'},
+            {'accept-encoding', 'identity;q=1.0,gzip;q=0.5,*;q=0'}]
   @max_redirects 10
   @parseable_types [
     "text/html", "text/xml", "application/xml", "application/xhtml+xml"
   ]
   @domain_blacklist [
-    %R/(192|127|10)\.\d{1,3}\.\d{1,3}\.\d{1,3}/, %R/localhost/i
+    %R/^(192|127|10)\.\d{1,3}\.\d{1,3}\.\d{1,3}$/, %R/^localhost$/i,
+    %R/^\[(0*:)*:?:1\]$/
   ]
 
   def init(nil) do
@@ -54,16 +57,22 @@ defmodule Nulform.Plugins.URLAnalyzer do
       (?:
        (?:https?://)|(?:www\.)      # Catch http://, https:// and www.
      )
-      (?:
-        (?:\w+\-)*\w+\.
-      )+                            # Hostname parts, \w separated by - and . at the end
-      (?:
+      (                             # Domain or IPv4
        (?:
-          (?:[a-z]+(?:\w+\-)*\w+)   # Top-level domain, starts with a-z
-          \.?                       # Optional root domain dot
+          (?:\w+\-)*\w+\.
+        )+                          # Hostname parts, \w separated by - and . at the end
+        (?:
+          (?:
+            (?:[a-z]+(?:\w+\-)*\w+) # Top-level domain, starts with a-z
+            \.?                     # Optional root domain dot
+          )
+          |
+          (?:\d{1,3})               # Or an IP address final term
        )
-        |
-        (?:\d{1,3})                 # Or an IP address final term
+        |                           # IPv6
+        \[
+        (?:[0-9a-f]:?)+
+        \]
      )
      (?::\d+)?                     # Optional port number
      (?:
@@ -79,18 +88,18 @@ defmodule Nulform.Plugins.URLAnalyzer do
   def run_analyzer(msg, id, urls) when is_list urls do
     if not Enum.empty? urls do
       [url | rest] = urls
-      Kernel.spawn __MODULE__, :analyze_url, [msg, id, Enum.at(url, 0)]
+      Kernel.spawn __MODULE__, :run_analyzer, [msg, id, Enum.at(url, 0)]
       id = if(id != nil, do: id + 1)
       run_analyzer msg, id, rest
     end
   end
 
-  def analyze_url(msg, id, url) when is_binary url do
+  def run_analyzer(msg, id, url) when is_binary url do
     id_str = ""
     size_str = ""
 
     if id != nil do
-      id_str = "(" <> to_binary(id) <> ") "
+      id_str = "(" <> to_string(id) <> ") "
     end
 
     analysis = analyze_url url
@@ -116,7 +125,7 @@ defmodule Nulform.Plugins.URLAnalyzer do
             IO.inspect result
           end
         else
-          result = result <> "HTTP " <> to_binary status
+          result = result <> "HTTP " <> to_string status
         end
 
       {:error, error, domain} ->
@@ -143,12 +152,12 @@ defmodule Nulform.Plugins.URLAnalyzer do
     analyze_url url, redirects, :head
   end
 
-  def analyze_url(url, redirects, mode) when redirects > @max_redirects do
+  def analyze_url(url, redirects, _) when redirects > @max_redirects do
     {:error, :max_redirects, URI.parse(url).authority}
   end
 
   def analyze_url(url, redirects, mode) do
-    IO.puts "Analyzing " <> url <> " round " <> to_binary redirects
+    IO.puts "Analyzing " <> url <> " round " <> to_string redirects
     title = ""
 
     if String.starts_with? url, "www." do
@@ -164,8 +173,8 @@ defmodule Nulform.Plugins.URLAnalyzer do
       false ->
         result = case mode do
-          :head -> http_head binary_to_list(url)
-          :get -> http_get binary_to_list(url)
+          :head -> http_head String.to_char_list!(url)
+          :get -> http_get String.to_char_list!(url)
           _ -> {:error, :unknown_method, domain} # fail
         end
 
        case result do
@@ -173,7 +182,7 @@ defmodule Nulform.Plugins.URLAnalyzer do
            content_type = parse_content_type headers['content-type']
 
            if status == 301 or status == 302 or status == 303 or status == 307 do
-              new_url = to_binary headers['location']
+              new_url = to_string headers['location']
              # Fix non-absolute location URIs by retard webdevs
              if not String.starts_with? String.downcase(new_url), ["http://", "https://"] do
                IO.puts "Fixing " <> new_url <> " to..."
@@ -186,6 +195,9 @@ defmodule Nulform.Plugins.URLAnalyzer do
            if mode != :get and (status != 200 or Enum.any? @parseable_types, fn(x) -> x == content_type end) do
              analyze_url url, redirects + 1, :get
            else
+              # Some servers send a gzipped body even though we asked for
+              # identity encoding, so decompress it when needed
+              body = maybe_gunzip headers, body
              IO.inspect mode
              IO.inspect status
              IO.inspect content_type
@@ -195,7 +207,7 @@ defmodule Nulform.Plugins.URLAnalyzer do
              end
 
              IO.inspect Nulform.Utilities.to_utf8(title)
-              content_length = to_binary headers['content-length']
+              content_length = to_string headers['content-length']
              if content_length != "" do
                content_length = binary_to_integer content_length
              end
@@ -220,7 +232,7 @@ defmodule Nulform.Plugins.URLAnalyzer do
   end
 
   def parse_title(html) do
-    regex = %R@<title[^>]*>([^>]*?)<\s*?/\s*?title\s*?>@i
+    regex = %R@<title[^>]*>([^<]*?)<\s*?/\s*?title\s*?>@i
     title = Regex.scan regex, html
 
     if not Enum.empty? title do
@@ -231,6 +243,16 @@ defmodule Nulform.Plugins.URLAnalyzer do
     end
   end
 
+  # Some sites send us gzipped content even though we don't ask for it, so
+  # check if we need to unzip
+  def maybe_gunzip(headers, body) do
+    if headers['content-encoding'] == 'gzip' do
+      :zlib.gunzip body
+    else
+      body
+    end
+  end
+
   def http_head(url) do
     http_req :head, url
   end
@@ -243,7 +265,7 @@ defmodule Nulform.Plugins.URLAnalyzer do
     :httpc.request mode, {url, @headers}, @http_options, @base_options
   end
 
-  defp parse_content_type(header) do
-    Enum.at String.split(to_binary(header), ";"), 0
+  def parse_content_type(header) do
+    Enum.at String.split(to_string(header), ";"), 0
   end
 end
diff --git a/mix.exs b/mix.exs
index 8730ab7..2f01641 100644
--- a/mix.exs
+++ b/mix.exs
@@ -18,8 +18,8 @@ defmodule Nulform.Mixfile do
   # { :foobar, "0.1", git: "https://github.com/elixir-lang/foobar.git" }
   defp deps do
     [
-      {:json, "0.0.2", git: "https://github.com/hio/erlang-json"},
-      {:excoder, "0.0.1", git: "https://Nicd@bitbucket.org/Nicd/excoder.git"}
+      {:json, github: "cblage/elixir-json"},
+      {:excoder, "1.0.0", git: "https://Nicd@bitbucket.org/Nicd/excoder.git"}
     ]
   end
 end