URL analyzer rewritten as GenServer, works pretty well. Added utilities

This commit is contained in:
Mikko Ahlroth 2013-07-15 23:18:52 +03:00
parent 14830d9c99
commit c5e925b214
2 changed files with 176 additions and 23 deletions

View file

@ -1,4 +1,5 @@
defmodule Nulform.Plugins.URLAnalyzer do defmodule Nulform.Plugins.URLAnalyzer do
use GenServer.Behaviour
@moduledoc """ @moduledoc """
This is an example plugin which analyzes URLs on IRC. It scans incoming This is an example plugin which analyzes URLs on IRC. It scans incoming
IRC messages for URLs and analyzes them, returning data about them. IRC messages for URLs and analyzes them, returning data about them.
@ -12,40 +13,160 @@ defmodule Nulform.Plugins.URLAnalyzer do
* Try to find <title> contents from body and return it with other data * Try to find <title> contents from body and return it with other data
if found. if found.
""" """
@timeout 10000
@base_options [body_format: :binary, sync: true]
@http_options [timeout: @timeout, autoredirect: false]
# We need a real user agent since some sites fail on nonstandard ones
@headers [{'user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36'}]
@max_redirects 10
# gen_server init callback. Ensures the inets application (Erlang's HTTP
# client) is running before any requests are issued.
#
# NOTE(review): GenServer.Behaviour normally invokes init/1 and expects
# {:ok, state}; this arity-0 form returning a bare {:ok} looks WIP —
# confirm how the server is started.
def init() do
  # Tolerate inets having been started by another component instead of
  # crashing on the {:error, {:already_started, :inets}} return.
  case :inets.start() do
    :ok -> :ok
    {:error, {:already_started, :inets}} -> :ok
  end
  {:ok}
end
# Handles an incoming IRC message cast. Only PRIVMSG lines are scanned for
# URLs; each match is analyzed asynchronously via run_analyzer/3.
#
# NOTE(review): gen_server expects handle_cast/2 returning {:noreply, state};
# this arity-1 form returning a bare {:noreply} looks WIP — confirm wiring.
def handle_cast(msg) do
  case String.split msg.raw_msg do
    [_, "PRIVMSG" | _] ->
      urls = parse_urls msg.raw_msg
      run_analyzer msg.buffer, nil, urls
    # BUG FIX: ignore every non-PRIVMSG message instead of crashing the
    # server with a CaseClauseError.
    _ ->
      nil
  end
  {:noreply}
end
# Scans an IRC message for URL-like substrings.
#
# Returns the raw Regex.scan result: one entry per match, where element 0
# of each match is the full matched URL text. Returns [] when the message
# contains no URLs.
def parse_urls(msg) do
  regex = %R"""ix
  (
    (?:
      (?:https?://)|(?:www\.) # Catch http://, https:// and www.
    )
    (?:
      (?:\w+\-)*\w+\.
    )+ # Hostname parts, \w separated by - and . at the end
    (?:
      (?:[a-z]+(?:\w+\-)*\w+) # Top-level domain, starts with a-z
      \.? # Optional root domain dot
      (?::\d+)? # Optional port number
    )
    (?:
      (?:/[^?\s]*)+ # URL path, anything non-?, non-ws separated by /
      (?:\?(?:\S*))? # Optional query string, anything after ?
    )? # Make the whole path & query part optional
  )
  """
  Regex.scan regex, msg
end
# Spawns one analyzer process per parsed URL. Each entry in `urls` is a
# regex match whose element 0 is the full matched URL.
#
# BUG FIX: the buffer `id` must be forwarded to the spawned call so it hits
# analyze_url/3 (parent, id, url); the original passed only two arguments,
# which would dispatch to the wrong arity with misaligned parameters.
def run_analyzer(parent, id, [url | rest]) do
  Kernel.spawn __MODULE__, :analyze_url, [parent, id, elem(url, 0)]
  run_analyzer parent, id, rest
end

# Nothing left to analyze; mirrors the original's nil result for the
# empty case (the caller ignores the return value).
def run_analyzer(_parent, _id, []) do
  nil
end
# Analyzes a single URL (binary form) and builds a result prefix string.
#
# NOTE(review): this clause looks unfinished — confirm before relying on it:
#   * the 4-tuple destructure below does not match analyze_url/2, which
#     returns a keyword list of five entries (or nil on redirect overflow);
#   * rebinding id_str/size_str inside `if` does not leak out of the branch
#     in Elixir proper (pre-1.0 scoping differed — verify target version),
#     so `result` may only ever see the "" defaults;
#   * `result` is computed but never sent to `parent`, and `size_str`,
#     `type` and `title` are unused.
def analyze_url(parent, id, url) when is_binary url do
  id_str = ""
  size_str = ""
  {status, type, size, title} = binary_to_list url |> analyze_url
  # Prefix the buffer id, e.g. "(42) ", when one was supplied.
  if id != nil do
    id_str = "(" <> id <> ") "
  end
  # Append a human-readable size, e.g. "(1.50 MB)", when known.
  if is_number size and size > 0 do
    size_str = "(" <> human_bytes(size) <> ")"
  end
  result = id_str
end
# Public entry point for charlist URLs: start with zero redirects seen.
def analyze_url(url) when is_list(url), do: analyze_url(url, 0)

# Redirect budget exhausted — give up and report nothing.
def analyze_url(_url, redirects) when redirects > @max_redirects, do: nil
# Analyzes a single URL given as an Erlang charlist: issues a HEAD request,
# follows 301/302/303/307 redirects (bounded by @max_redirects via the
# guard clause above), then GETs the page and returns a keyword list with
# :status, :domain, :type, :size and :title. Returns nil when the redirect
# chain is too long (see the guard clause).
def analyze_url(url, redirects) when is_list url do
  {:ok, {{_, status, _}, headers, _head_body}} = http_head url
  if status == 301 or status == 302 or status == 303 or status == 307 do
    # Follow the redirect target, counting this hop.
    analyze_url(headers['location'], redirects + 1)
  else
    # NOTE(review): the content-type gate was commented out in the
    # original, so every non-redirect response is fetched in full.
    #if (status == 200 and content_type == "text/html") do
    {:ok, {{_, status, _}, headers, body}} = http_get url
    title = parse_title body
    content_type = parse_content_type headers['content-type']
    #end
    # BUG FIX: the original rebound content_length inside `if`, which does
    # not leak out of the branch in Elixir; bind the expression result
    # instead. A missing header stays "" exactly as before.
    raw_length = to_binary headers['content-length']
    content_length = if raw_length != "" do
      binary_to_integer raw_length
    else
      raw_length
    end
    domain = URI.parse(to_binary url).authority
    [
      status: status,
      domain: domain,
      type: content_type,
      size: content_length,
      title: Nulform.Utilities.to_utf8(to_binary title)
    ]
  end
end
# Extracts the contents of the first <title> element from an HTML binary,
# or nil when no title is present.
def parse_title(html) do
  regex = %R@<title.*?>([^>]*?)<\s*?/\s*?title\s*?>@i
  matches = Regex.scan regex, html
  if not Enum.empty? matches do
    # First element of the first match; relies on the Regex.scan result
    # shape of the Elixir version in use — verify on upgrade.
    Enum.at Enum.at(matches, 0), 0
  else
    nil
  end
end
# Shorthand for a HEAD request.
def http_head(url), do: http_req(:head, url)

# Shorthand for a GET request.
def http_get(url), do: http_req(:get, url)

# Issues a synchronous :httpc request with the module-wide user agent,
# timeout/redirect settings and binary body format.
def http_req(mode, url) do
  :httpc.request(mode, {url, @headers}, @http_options, @base_options)
end
# Formats a byte count as a human-readable string, e.g. "1.50 MB".
# Defaults: factor 1000 between units, two decimal places.
def human_bytes(size) do
  human_bytes size, 1000
end

def human_bytes(size, factor) do
  human_bytes size, factor, 2
end

def human_bytes(size, factor, decimals) do
  # BUG FIX: "TP" corrected to "TB" (terabytes).
  human_bytes size, factor, decimals, ["B", "kB", "MB", "GB", "TB", "PB"]
end

# Step to the next-larger unit while the value still exceeds the factor.
# The rest != [] guard stops at the largest available unit instead of
# exhausting the list and crashing on huge inputs.
def human_bytes(size, factor, decimals, [_ext | rest]) when size >= factor and rest != [] do
  human_bytes size / factor, factor, decimals, rest
end

def human_bytes(size, factor, decimals, [ext | _rest]) do
  # BUG FIX: float_to_binary/2 raises for integer input (any size below the
  # first factor arrives as an integer); size / 1 coerces to float first.
  :erlang.float_to_binary(size / 1, decimals: decimals) <> " " <> ext
end
# Returns just the MIME type from a content-type header value (an Erlang
# charlist or nil), dropping any ";charset=..." parameter.
defp parse_content_type(header) do
  header
  |> to_binary()
  |> String.split(";")
  |> Enum.at(0)
end
end end

32
lib/nulform/utilities.ex Normal file
View file

@ -0,0 +1,32 @@
defmodule Nulform.Utilities do
  @moduledoc """
  Grab-bag utilities for the bot that have no better home elsewhere.
  """

  @doc """
  Best-effort conversion of a binary to UTF-8 while preserving its content.

  The true encoding of the input cannot be known here, so the binary is
  returned as-is when it is already valid UTF-8 and otherwise re-encoded
  as latin-1. Telling latin-1 apart from the other 8-bit encodings would
  require smelly heuristics, so latin-1 is the only fallback supported.

  Don't you just love character encodings?
  """
  def to_utf8(string) do
    case String.valid?(string) do
      true -> string
      false -> latin1_to_utf8(string)
    end
  end

  @doc """
  Convert a latin-1 binary to UTF-8. Gives wrong results if the input is
  not actually latin-1.
  """
  def latin1_to_utf8(string) do
    :unicode.characters_to_binary(string, :latin1, :utf8)
  end
end