From c5e925b21414cf6c1ccc052a3f20e7ca536d55c2 Mon Sep 17 00:00:00 2001 From: Mikko Ahlroth Date: Mon, 15 Jul 2013 23:18:52 +0300 Subject: [PATCH] URL analyzer rewritten as GenServer, works pretty well. Added utilities --- lib/nulform/plugins/urlanalyzer.ex | 167 +++++++++++++++++++++++++---- lib/nulform/utilities.ex | 32 ++++++ 2 files changed, 176 insertions(+), 23 deletions(-) create mode 100644 lib/nulform/utilities.ex diff --git a/lib/nulform/plugins/urlanalyzer.ex b/lib/nulform/plugins/urlanalyzer.ex index 74293a8..4ac4ce5 100644 --- a/lib/nulform/plugins/urlanalyzer.ex +++ b/lib/nulform/plugins/urlanalyzer.ex @@ -1,4 +1,5 @@ defmodule Nulform.Plugins.URLAnalyzer do + use GenServer.Behaviour @moduledoc """ This is an example plugin which analyzes URLs on IRC. It scans incoming IRC messages for URLs and analyzes them, returning data about them. @@ -12,40 +13,160 @@ defmodule Nulform.Plugins.URLAnalyzer do * Try to find contents from body and return it with other data if found. """ + @timeout 10000 + @base_options [body_format: :binary, sync: true] + @http_options [timeout: @timeout, autoredirect: false] + # We need a real user agent since some sites fail on nonstandard ones + @headers [{'user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36'}] + @max_redirects 10 - def run() do - run nil + def init() do + :ok = :inets.start() + {:ok} end - def run(parent) do - receive do - {:nulform, :set_parent, parent} -> - :ok = :inets.start() - - {:nulform, :urlanalyze, url, id} -> - Kernel.spawn __MODULE__, :run_analyzer, [parent, Kernel.binary_to_list url] + def handle_cast(msg) do + case String.split msg.raw_msg do + [_, "PRIVMSG" | _] -> + urls = parse_urls msg.raw_msg + run_analyzer msg.buffer, nil, urls end - run parent + {:noreply} end - def run_analyzer(parent, url) do - {:ok, {{_, status, _}, headers, body}} = :httpc.request :head, {url, []}, [], [] - [content_type | _] = String.split to_binary(headers['content-type']), ";" + def parse_urls(msg) do + regex = %R"""ix + ( + (?: + (?:https?://)|(?:www\.) # Catch http://, https:// and www. + ) + (?: + (?:\w+\-)*\w+\. + )+ # Hostname parts, \w separated by - and . at the end + (?: + (?:[a-z]+(?:\w+\-)*\w+) # Top-level domain, starts with a-z + \.? # Optional root domain dot + (?::\d+)? # Optional port number + ) + (?: + (?:/[^?\s]*)+ # URL path, anything non-?, non-ws separated by / + (?:\?(?:\S*))? # Optional query string, anything after ? + )? # Make the whole path & query part optional + ) + """ - if status != 200 do - result = {status, content_type, headers['content-length'], body} + Regex.scan regex, msg + end + + def run_analyzer(parent, id, urls) when is_list urls do + if not Enum.empty? urls do + [url | rest] = urls + Kernel.spawn __MODULE__, :analyze_url, [parent, elem(url, 0)] + run_analyzer parent, id, rest + end + end + + def analyze_url(parent, id, url) when is_binary url do + id_str = "" + size_str = "" + + {status, type, size, title} = binary_to_list url |> analyze_url + if id != nil do + id_str = "(" <> id <> ") " + end + + if is_number size and size > 0 do + size_str = "(" <> human_bytes(size) <> ")" + end + + result = id_str + end + + def analyze_url(url) when is_list url do + analyze_url url, 0 + end + + def analyze_url(url, redirects) when redirects > @max_redirects do + nil + end + + def analyze_url(url, redirects) when is_list url do + title = nil + {:ok, {{_, status, _}, headers, body}} = http_head url + + content_type = parse_content_type headers['content-type'] + + if status == 301 or status == 302 or status == 303 or status == 307 do + analyze_url(headers['location'], redirects + 1) else - if content_type == "text/html" do - {:ok, {{_, status, _}, headers, body}} = :httpc.request url - end - result = {status, content_type, headers['content_length'], body} - end + #if (status == 200 and content_type == "text/html") do + {:ok, {{_, status, _}, headers, body}} = http_get url + title = parse_title body + content_type = parse_content_type headers['content-type'] + #end - parent <- result + content_length = to_binary headers['content-length'] + if content_length != "" do + content_length = binary_to_integer content_length + end + + domain = URI.parse(to_binary url).authority + + [ + status: status, + domain: domain, + type: content_type, + size: content_length, + title: Nulform.Utilities.to_utf8(to_binary title) + ] + end end - def find_urls(message) do - [] + def parse_title(html) do + regex = %R@<title.*?>([^>]*?)<\s*?/\s*?title\s*?>@i + title = Regex.scan regex, html + + if not Enum.empty? title do + Enum.at Enum.at(title, 0), 0 + else + nil + end + end + + def http_head(url) do + http_req :head, url + end + + def http_get(url) do + http_req :get, url + end + + def http_req(mode, url) do + :httpc.request mode, {url, @headers}, @http_options, @base_options + end + + def human_bytes(size) do + human_bytes size, 1000 + end + + def human_bytes(size, factor) do + human_bytes size, factor, 2 + end + + def human_bytes(size, factor, decimals) do + human_bytes size, factor, decimals, ["B", "kB", "MB", "GB", "TP", "PB"] + end + + def human_bytes(size, factor, decimals, [ext | rest]) when size >= factor do + human_bytes size / factor, factor, decimals, rest + end + + def human_bytes(size, factor, decimals, [ext | rest]) do + float_to_binary(size, [decimals: decimals]) <> " " <> ext + end + + defp parse_content_type(header) do + Enum.at String.split(to_binary(header), ";"), 0 end end diff --git a/lib/nulform/utilities.ex b/lib/nulform/utilities.ex new file mode 100644 index 0000000..0ebc35d --- /dev/null +++ b/lib/nulform/utilities.ex @@ -0,0 +1,32 @@ +defmodule Nulform.Utilities do + @moduledoc """ + This module contains utilities for the bot that don't really go anywhere + else. + """ + + @doc """ + Convert a string to UTF-8, trying valiantly to maintain the original + content. There may be no way to really know the encoding of the input, so we + will first try it as UTF-8 and then as latin-1 if it was not valid. + Unfortunately there is *no* feasible way to guess between latin-1 and all of + the other 8-bit encodings if we don't resort to smelly heuristics, so + latin-1 is all we will support. + + Don't you just love character encodings? + """ + def to_utf8(string) do + if String.valid? string do + string + else + latin1_to_utf8 string + end + end + + @doc """ + Convert a latin-1 string to UTF-8. Will give the wrong results if the + original string is not actually latin-1. + """ + def latin1_to_utf8(string) do + :unicode.characters_to_binary string, :latin1, :utf8 + end +end