URL analyzer rewritten as GenServer, works pretty well. Added utilities

This commit is contained in:
Mikko Ahlroth 2013-07-15 23:18:52 +03:00
parent 14830d9c99
commit c5e925b214
2 changed files with 176 additions and 23 deletions

View file

@ -1,4 +1,5 @@
defmodule Nulform.Plugins.URLAnalyzer do
# Pre-1.0 Elixir OTP shim; modern code would be `use GenServer`.
use GenServer.Behaviour
@moduledoc """
This is an example plugin which analyzes URLs on IRC. It scans incoming
IRC messages for URLs and analyzes them, returning data about them.
@ -12,40 +13,160 @@ defmodule Nulform.Plugins.URLAnalyzer do
* Try to find <title> contents from body and return it with other data
if found.
"""
# NOTE(review): this module text comes from a rendered git diff; the
# "@ -12,40 +13,160 @@ ..." hunk header above sits INSIDE the @moduledoc
# heredoc and is diff residue, not real documentation.
# Per-request :httpc timeout, in milliseconds.
@timeout 10000
# Options shared by every :httpc call: synchronous request, body as binary.
@base_options [body_format: :binary, sync: true]
# autoredirect is disabled because redirects are followed manually so the
# hop count can be capped at @max_redirects (see analyze_url/2).
@http_options [timeout: @timeout, autoredirect: false]
# We need a real user agent since some sites fail on nonstandard ones
@headers [{'user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36'}]
# Maximum redirect hops before analyze_url/2 gives up and returns nil.
@max_redirects 10
# NOTE(review): the lines below are rendered git-diff residue — removed
# (pre-commit) and added (post-commit) lines are interleaved without +/-
# markers, so this span is NOT syntactically valid Elixir as shown.
# Added in this commit: init/0 (starts :inets, returns {:ok}) and
# handle_cast/1 (scans PRIVMSG lines for URLs and hands them, untagged,
# to run_analyzer/3).
# Removed in this commit: the old receive-loop run/0 and run/1 and the
# first lines of the old run_analyzer/2 (which continues further below).
def run() do
run nil
def init() do
:ok = :inets.start()
{:ok}
end
def run(parent) do
receive do
{:nulform, :set_parent, parent} ->
:ok = :inets.start()
{:nulform, :urlanalyze, url, id} ->
Kernel.spawn __MODULE__, :run_analyzer, [parent, Kernel.binary_to_list url]
def handle_cast(msg) do
case String.split msg.raw_msg do
[_, "PRIVMSG" | _] ->
urls = parse_urls msg.raw_msg
run_analyzer msg.buffer, nil, urls
end
run parent
{:noreply}
end
# NOTE(review): removed residue — start of the old run_analyzer/2.
def run_analyzer(parent, url) do
{:ok, {{_, status, _}, headers, body}} = :httpc.request :head, {url, []}, [], []
[content_type | _] = String.split to_binary(headers['content-type']), ";"
# Scans a raw IRC message for URL-looking substrings and returns the
# Regex.scan match list. The extended (x) regex accepts http://, https://
# and bare www. prefixes, hostnames, an optional port, and an optional
# path/query part.
def parse_urls(msg) do
regex = %R"""ix
(
(?:
(?:https?://)|(?:www\.) # Catch http://, https:// and www.
)
(?:
(?:\w+\-)*\w+\.
)+ # Hostname parts, \w separated by - and . at the end
(?:
(?:[a-z]+(?:\w+\-)*\w+) # Top-level domain, starts with a-z
\.? # Optional root domain dot
(?::\d+)? # Optional port number
)
(?:
(?:/[^?\s]*)+ # URL path, anything non-?, non-ws separated by /
(?:\?(?:\S*))? # Optional query string, anything after ?
)? # Make the whole path & query part optional
)
"""
# NOTE(review): the next two lines are removed-diff residue belonging to
# the old run_analyzer/2; they do not belong inside parse_urls/1.
if status != 200 do
result = {status, content_type, headers['content-length'], body}
Regex.scan regex, msg
end
# Spawns one analyzer process per URL match found by parse_urls/1.
#
# `parent` is the process meant to receive analysis results, `id` is an
# optional tag threaded through to analyze_url/3 (nil when absent), and
# `urls` is the Regex.scan match list (each entry indexed with elem/2 —
# presumably a tuple-shaped match; TODO confirm against parse_urls output).
#
# NOTE(review): the original spawned :analyze_url with only
# [parent, elem(url, 0)], silently dropping `id` even though
# analyze_url/3 is (parent, id, url); the id is now forwarded.
def run_analyzer(parent, id, urls) when is_list urls do
  if not Enum.empty? urls do
    [url | rest] = urls
    Kernel.spawn __MODULE__, :analyze_url, [parent, id, elem(url, 0)]
    run_analyzer parent, id, rest
  end
end
# Analyzes a single URL given as a binary and formats a one-line summary
# with an optional "(id) " prefix and a "(<size>)" suffix.
# NOTE(review): this function looks unfinished in this commit:
#   - analyze_url/1 returns a keyword list (or nil), so destructuring its
#     result as {status, type, size, title} would raise a MatchError;
#   - id_str/size_str are rebound inside `if` bodies, which only had an
#     effect under pre-1.0 Elixir's leaking if-scope;
#   - `result` is built from id_str alone and is never sent to `parent`
#     (parent is otherwise unused here).
def analyze_url(parent, id, url) when is_binary url do
id_str = ""
size_str = ""
{status, type, size, title} = binary_to_list url |> analyze_url
if id != nil do
id_str = "(" <> id <> ") "
end
if is_number size and size > 0 do
size_str = "(" <> human_bytes(size) <> ")"
end
result = id_str
end
# Entry point for analyzing a charlist URL: kicks off the manual
# redirect-following loop with a hop count of zero.
def analyze_url(url) when is_list(url), do: analyze_url(url, 0)
# Gives up once the redirect chain exceeds @max_redirects and returns nil
# so callers can tell that no analysis result was produced.
# (The url binding was unused and is now underscored to silence the
# compiler warning.)
def analyze_url(_url, redirects) when redirects > @max_redirects do
  nil
end
# Fetches `url` (a charlist), manually following up to @max_redirects
# redirect hops, then GETs the page and returns a keyword list with
# :status, :domain, :type, :size and :title.
# NOTE(review): removed-diff residue is interleaved below (marked inline);
# as shown this span is not syntactically valid Elixir.
def analyze_url(url, redirects) when is_list url do
title = nil
# HEAD first so redirects and content type can be checked cheaply.
{:ok, {{_, status, _}, headers, body}} = http_head url
content_type = parse_content_type headers['content-type']
if status == 301 or status == 302 or status == 303 or status == 307 do
analyze_url(headers['location'], redirects + 1)
else
# NOTE(review): the next five lines are removed-diff residue from the
# old run_analyzer/2 (old inline GET and tuple result) — not part of
# the new function body.
if content_type == "text/html" do
{:ok, {{_, status, _}, headers, body}} = :httpc.request url
end
result = {status, content_type, headers['content_length'], body}
end
#if (status == 200 and content_type == "text/html") do
{:ok, {{_, status, _}, headers, body}} = http_get url
title = parse_title body
content_type = parse_content_type headers['content-type']
#end
# NOTE(review): removed residue — the old pre-1.0 send operator line.
parent <- result
content_length = to_binary headers['content-length']
if content_length != "" do
# Rebinding inside `if` relied on pre-1.0 leaking scope.
content_length = binary_to_integer content_length
end
domain = URI.parse(to_binary url).authority
[
status: status,
domain: domain,
type: content_type,
size: content_length,
title: Nulform.Utilities.to_utf8(to_binary title)
]
end
end
# NOTE(review): removed residue — the old find_urls/1 stub; its closing
# `end` was consumed by the diff rendering.
def find_urls(message) do
[]
# Extracts the text of the first <title> element from an HTML document,
# or nil when no title tag matches. The pattern is case-insensitive and
# tolerates whitespace inside the closing tag.
def parse_title(html) do
  case Regex.scan(%R@<title.*?>([^>]*?)<\s*?/\s*?title\s*?>@i, html) do
    [] -> nil
    [first_match | _] -> Enum.at first_match, 0
  end
end
# Issues a HEAD request for `url` via http_req/2.
def http_head(url), do: http_req(:head, url)
# Issues a GET request for `url` via http_req/2.
def http_get(url), do: http_req(:get, url)
# Performs a synchronous :httpc request in `mode` (:head or :get) using
# the module-wide headers, timeout and no-autoredirect options.
def http_req(mode, url) do
  request = {url, @headers}
  :httpc.request(mode, request, @http_options, @base_options)
end
# Formats a byte count as a human-readable string, e.g. "1.50 kB".
def human_bytes(size) do
  human_bytes size, 1000
end

# `factor` is the divisor between successive units (1000, or 1024 for
# binary prefixes).
def human_bytes(size, factor) do
  human_bytes size, factor, 2
end

# `decimals` is the number of decimal places in the formatted value.
# NOTE(review): the original unit list had the typo "TP" for terabytes.
def human_bytes(size, factor, decimals) do
  human_bytes size, factor, decimals, ["B", "kB", "MB", "GB", "TB", "PB"]
end

# Walk the unit list while the value still reaches `factor`; stop on the
# last unit instead of crashing when the list runs out (the original
# raised a MatchError past petabytes).
def human_bytes(size, factor, decimals, [_ext | rest]) when size >= factor and rest != [] do
  human_bytes size / factor, factor, decimals, rest
end

def human_bytes(size, factor, decimals, [ext | _rest]) do
  # `size / 1` coerces integers to float; the original passed raw integers
  # to float_to_binary, which raises for any size below `factor`. The
  # :erlang BIF is used directly (old Kernel.float_to_binary delegated to
  # it) so the code works on every Elixir version.
  :erlang.float_to_binary(size / 1, [decimals: decimals]) <> " " <> ext
end
# Strips parameters such as "; charset=utf-8" from a Content-Type header
# value (an :httpc charlist), returning only the media type as a binary.
defp parse_content_type(header) do
  header |> to_binary() |> String.split(";") |> Enum.at(0)
end
end

32
lib/nulform/utilities.ex Normal file
View file

@ -0,0 +1,32 @@
defmodule Nulform.Utilities do
  @moduledoc """
  Grab-bag of helpers for the bot that have no better home.
  """

  @doc """
  Coerces a string to valid UTF-8 while keeping the original content
  intact whenever possible. The input is accepted as-is when it already
  validates as UTF-8; otherwise it is reinterpreted as latin-1.
  There is *no* feasible way to distinguish latin-1 from the other 8-bit
  encodings without smelly heuristics, so latin-1 is the only fallback
  supported. Don't you just love character encodings?
  """
  def to_utf8(string) do
    case String.valid? string do
      true -> string
      false -> latin1_to_utf8 string
    end
  end

  @doc """
  Reinterprets a latin-1 encoded binary as UTF-8. Produces garbage if the
  input was not actually latin-1.
  """
  def latin1_to_utf8(string) do
    :unicode.characters_to_binary string, :latin1, :utf8
  end
end