URL analyzer rewritten as GenServer, works pretty well. Added utilities

This commit is contained in:
Mikko Ahlroth 2013-07-15 23:18:52 +03:00
parent 14830d9c99
commit c5e925b214
2 changed files with 176 additions and 23 deletions

View file

@ -1,4 +1,5 @@
defmodule Nulform.Plugins.URLAnalyzer do
# Pre-1.0 Elixir OTP shim; modern code would be `use GenServer`.
use GenServer.Behaviour
@moduledoc """
This is an example plugin which analyzes URLs on IRC. It scans incoming
IRC messages for URLs and analyzes them, returning data about them.
@ -12,40 +13,160 @@ defmodule Nulform.Plugins.URLAnalyzer do
* Try to find <title> contents from body and return it with other data
if found.
"""
# NOTE(review): this module text comes from a rendered git diff; the
# "@ -12,40 +13,160 @@ ..." hunk header above sits INSIDE the @moduledoc
# heredoc and is diff residue, not real documentation.
# Per-request :httpc timeout, in milliseconds.
@timeout 10000
# Options shared by every :httpc call: synchronous request, body as binary.
@base_options [body_format: :binary, sync: true]
# autoredirect is disabled because redirects are followed manually so the
# hop count can be capped at @max_redirects (see analyze_url/2).
@http_options [timeout: @timeout, autoredirect: false]
# We need a real user agent since some sites fail on nonstandard ones
@headers [{'user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36'}]
# Maximum redirect hops before analyze_url/2 gives up and returns nil.
@max_redirects 10
# NOTE(review): the lines below are rendered git-diff residue — removed
# (pre-commit) and added (post-commit) lines are interleaved without +/-
# markers, so this span is NOT syntactically valid Elixir as shown.
# Added in this commit: init/0 (starts :inets, returns {:ok}) and
# handle_cast/1 (scans PRIVMSG lines for URLs and hands them, untagged,
# to run_analyzer/3).
# Removed in this commit: the old receive-loop run/0 and run/1 and the
# first lines of the old run_analyzer/2 (which continues further below).
def run() do
run nil
def init() do
:ok = :inets.start()
{:ok}
end
def run(parent) do
receive do
{:nulform, :set_parent, parent} ->
:ok = :inets.start()
{:nulform, :urlanalyze, url, id} ->
Kernel.spawn __MODULE__, :run_analyzer, [parent, Kernel.binary_to_list url]
def handle_cast(msg) do
case String.split msg.raw_msg do
[_, "PRIVMSG" | _] ->
urls = parse_urls msg.raw_msg
run_analyzer msg.buffer, nil, urls
end
run parent
{:noreply}
end
# NOTE(review): removed residue — start of the old run_analyzer/2.
def run_analyzer(parent, url) do
{:ok, {{_, status, _}, headers, body}} = :httpc.request :head, {url, []}, [], []
[content_type | _] = String.split to_binary(headers['content-type']), ";"
# Scans a raw IRC message for URL-looking substrings and returns the
# Regex.scan match list. The extended (x) regex accepts http://, https://
# and bare www. prefixes, hostnames, an optional port, and an optional
# path/query part.
def parse_urls(msg) do
regex = %R"""ix
(
(?:
(?:https?://)|(?:www\.) # Catch http://, https:// and www.
)
(?:
(?:\w+\-)*\w+\.
)+ # Hostname parts, \w separated by - and . at the end
(?:
(?:[a-z]+(?:\w+\-)*\w+) # Top-level domain, starts with a-z
\.? # Optional root domain dot
(?::\d+)? # Optional port number
)
(?:
(?:/[^?\s]*)+ # URL path, anything non-?, non-ws separated by /
(?:\?(?:\S*))? # Optional query string, anything after ?
)? # Make the whole path & query part optional
)
"""
# NOTE(review): the next two lines are removed-diff residue belonging to
# the old run_analyzer/2; they do not belong inside parse_urls/1.
if status != 200 do
result = {status, content_type, headers['content-length'], body}
Regex.scan regex, msg
end
# Spawns one analyzer process per URL match found by parse_urls/1.
#
# `parent` is the process meant to receive analysis results, `id` is an
# optional tag threaded through to analyze_url/3 (nil when absent), and
# `urls` is the Regex.scan match list (each entry indexed with elem/2 —
# presumably a tuple-shaped match; TODO confirm against parse_urls output).
#
# NOTE(review): the original spawned :analyze_url with only
# [parent, elem(url, 0)], silently dropping `id` even though
# analyze_url/3 is (parent, id, url); the id is now forwarded.
def run_analyzer(parent, id, urls) when is_list urls do
  if not Enum.empty? urls do
    [url | rest] = urls
    Kernel.spawn __MODULE__, :analyze_url, [parent, id, elem(url, 0)]
    run_analyzer parent, id, rest
  end
end
# Analyzes a single URL given as a binary and formats a one-line summary
# with an optional "(id) " prefix and a "(<size>)" suffix.
# NOTE(review): this function looks unfinished in this commit:
#   - analyze_url/1 returns a keyword list (or nil), so destructuring its
#     result as {status, type, size, title} would raise a MatchError;
#   - id_str/size_str are rebound inside `if` bodies, which only had an
#     effect under pre-1.0 Elixir's leaking if-scope;
#   - `result` is built from id_str alone and is never sent to `parent`
#     (parent is otherwise unused here).
def analyze_url(parent, id, url) when is_binary url do
id_str = ""
size_str = ""
{status, type, size, title} = binary_to_list url |> analyze_url
if id != nil do
id_str = "(" <> id <> ") "
end
if is_number size and size > 0 do
size_str = "(" <> human_bytes(size) <> ")"
end
result = id_str
end
# Entry point for analyzing a charlist URL: kicks off the manual
# redirect-following loop with a hop count of zero.
def analyze_url(url) when is_list(url), do: analyze_url(url, 0)
# Gives up once the redirect chain exceeds @max_redirects and returns nil
# so callers can tell that no analysis result was produced.
# (The url binding was unused and is now underscored to silence the
# compiler warning.)
def analyze_url(_url, redirects) when redirects > @max_redirects do
  nil
end
# Fetches `url` (a charlist), manually following up to @max_redirects
# redirect hops, then GETs the page and returns a keyword list with
# :status, :domain, :type, :size and :title.
# NOTE(review): removed-diff residue is interleaved below (marked inline);
# as shown this span is not syntactically valid Elixir.
def analyze_url(url, redirects) when is_list url do
title = nil
# HEAD first so redirects and content type can be checked cheaply.
{:ok, {{_, status, _}, headers, body}} = http_head url
content_type = parse_content_type headers['content-type']
if status == 301 or status == 302 or status == 303 or status == 307 do
analyze_url(headers['location'], redirects + 1)
else
# NOTE(review): the next five lines are removed-diff residue from the
# old run_analyzer/2 (old inline GET and tuple result) — not part of
# the new function body.
if content_type == "text/html" do
{:ok, {{_, status, _}, headers, body}} = :httpc.request url
end
result = {status, content_type, headers['content_length'], body}
end
#if (status == 200 and content_type == "text/html") do
{:ok, {{_, status, _}, headers, body}} = http_get url
title = parse_title body
content_type = parse_content_type headers['content-type']
#end
# NOTE(review): removed residue — the old pre-1.0 send operator line.
parent <- result
content_length = to_binary headers['content-length']
if content_length != "" do
# Rebinding inside `if` relied on pre-1.0 leaking scope.
content_length = binary_to_integer content_length
end
domain = URI.parse(to_binary url).authority
[
status: status,
domain: domain,
type: content_type,
size: content_length,
title: Nulform.Utilities.to_utf8(to_binary title)
]
end
end
# NOTE(review): removed residue — the old find_urls/1 stub; its closing
# `end` was consumed by the diff rendering.
def find_urls(message) do
[]
# Extracts the text of the first <title> element from an HTML document,
# or nil when no title tag matches. The pattern is case-insensitive and
# tolerates whitespace inside the closing tag.
def parse_title(html) do
  case Regex.scan(%R@<title.*?>([^>]*?)<\s*?/\s*?title\s*?>@i, html) do
    [] -> nil
    [first_match | _] -> Enum.at first_match, 0
  end
end
# Issues a HEAD request for `url` via http_req/2.
def http_head(url), do: http_req(:head, url)
# Issues a GET request for `url` via http_req/2.
def http_get(url), do: http_req(:get, url)
# Performs a synchronous :httpc request in `mode` (:head or :get) using
# the module-wide headers, timeout and no-autoredirect options.
def http_req(mode, url) do
  request = {url, @headers}
  :httpc.request(mode, request, @http_options, @base_options)
end
# Formats a byte count as a human-readable string, e.g. "1.50 kB".
def human_bytes(size) do
  human_bytes size, 1000
end

# `factor` is the divisor between successive units (1000, or 1024 for
# binary prefixes).
def human_bytes(size, factor) do
  human_bytes size, factor, 2
end

# `decimals` is the number of decimal places in the formatted value.
# NOTE(review): the original unit list had the typo "TP" for terabytes.
def human_bytes(size, factor, decimals) do
  human_bytes size, factor, decimals, ["B", "kB", "MB", "GB", "TB", "PB"]
end

# Walk the unit list while the value still reaches `factor`; stop on the
# last unit instead of crashing when the list runs out (the original
# raised a MatchError past petabytes).
def human_bytes(size, factor, decimals, [_ext | rest]) when size >= factor and rest != [] do
  human_bytes size / factor, factor, decimals, rest
end

def human_bytes(size, factor, decimals, [ext | _rest]) do
  # `size / 1` coerces integers to float; the original passed raw integers
  # to float_to_binary, which raises for any size below `factor`. The
  # :erlang BIF is used directly (old Kernel.float_to_binary delegated to
  # it) so the code works on every Elixir version.
  :erlang.float_to_binary(size / 1, [decimals: decimals]) <> " " <> ext
end
# Strips parameters such as "; charset=utf-8" from a Content-Type header
# value (an :httpc charlist), returning only the media type as a binary.
defp parse_content_type(header) do
  header |> to_binary() |> String.split(";") |> Enum.at(0)
end
end

32
lib/nulform/utilities.ex Normal file
View file

@ -0,0 +1,32 @@
defmodule Nulform.Utilities do
  @moduledoc """
  Grab-bag of helpers for the bot that have no better home.
  """

  @doc """
  Coerces a string to valid UTF-8 while keeping the original content
  intact whenever possible. The input is accepted as-is when it already
  validates as UTF-8; otherwise it is reinterpreted as latin-1.
  There is *no* feasible way to distinguish latin-1 from the other 8-bit
  encodings without smelly heuristics, so latin-1 is the only fallback
  supported. Don't you just love character encodings?
  """
  def to_utf8(string) do
    case String.valid? string do
      true -> string
      false -> latin1_to_utf8 string
    end
  end

  @doc """
  Reinterprets a latin-1 encoded binary as UTF-8. Produces garbage if the
  input was not actually latin-1.
  """
  def latin1_to_utf8(string) do
    :unicode.characters_to_binary string, :latin1, :utf8
  end
end