URL analyzer rewritten as GenServer, works pretty well. Added utilities

This commit is contained in:
Mikko Ahlroth 2013-07-15 23:18:52 +03:00
parent 14830d9c99
commit c5e925b214
2 changed files with 176 additions and 23 deletions

View file

@ -1,4 +1,5 @@
defmodule Nulform.Plugins.URLAnalyzer do defmodule Nulform.Plugins.URLAnalyzer do
use GenServer.Behaviour
@moduledoc """ @moduledoc """
This is an example plugin which analyzes URLs on IRC. It scans incoming This is an example plugin which analyzes URLs on IRC. It scans incoming
IRC messages for URLs and analyzes them, returning data about them. IRC messages for URLs and analyzes them, returning data about them.
@ -12,40 +13,160 @@ defmodule Nulform.Plugins.URLAnalyzer do
* Try to find <title> contents from body and return it with other data * Try to find <title> contents from body and return it with other data
if found. if found.
""" """
@timeout 10000
@base_options [body_format: :binary, sync: true]
@http_options [timeout: @timeout, autoredirect: false]
# We need a real user agent since some sites fail on nonstandard ones
@headers [{'user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36'}]
@max_redirects 10
# gen_server init callback. Ensures the inets application (Erlang's HTTP
# client) is running before any requests are issued.
#
# NOTE(review): GenServer.Behaviour normally invokes init/1 and expects
# {:ok, state}; this arity-0 form returning a bare {:ok} looks WIP —
# confirm how the server is started.
def init() do
  # Tolerate inets having been started by another component instead of
  # crashing on the {:error, {:already_started, :inets}} return.
  case :inets.start() do
    :ok -> :ok
    {:error, {:already_started, :inets}} -> :ok
  end
  {:ok}
end
# Handles an incoming IRC message cast. Only PRIVMSG lines are scanned for
# URLs; each match is analyzed asynchronously via run_analyzer/3.
#
# NOTE(review): gen_server expects handle_cast/2 returning {:noreply, state};
# this arity-1 form returning a bare {:noreply} looks WIP — confirm wiring.
def handle_cast(msg) do
  case String.split msg.raw_msg do
    [_, "PRIVMSG" | _] ->
      urls = parse_urls msg.raw_msg
      run_analyzer msg.buffer, nil, urls
    # BUG FIX: ignore every non-PRIVMSG message instead of crashing the
    # server with a CaseClauseError.
    _ ->
      nil
  end
  {:noreply}
end
# Scans an IRC message for URL-like substrings.
#
# Returns the raw Regex.scan result: one entry per match, where element 0
# of each match is the full matched URL text. Returns [] when the message
# contains no URLs.
def parse_urls(msg) do
  regex = %R"""ix
  (
    (?:
      (?:https?://)|(?:www\.) # Catch http://, https:// and www.
    )
    (?:
      (?:\w+\-)*\w+\.
    )+ # Hostname parts, \w separated by - and . at the end
    (?:
      (?:[a-z]+(?:\w+\-)*\w+) # Top-level domain, starts with a-z
      \.? # Optional root domain dot
      (?::\d+)? # Optional port number
    )
    (?:
      (?:/[^?\s]*)+ # URL path, anything non-?, non-ws separated by /
      (?:\?(?:\S*))? # Optional query string, anything after ?
    )? # Make the whole path & query part optional
  )
  """
  Regex.scan regex, msg
end
# Spawns one analyzer process per parsed URL. Each entry in `urls` is a
# regex match whose element 0 is the full matched URL.
#
# BUG FIX: the buffer `id` must be forwarded to the spawned call so it hits
# analyze_url/3 (parent, id, url); the original passed only two arguments,
# which would dispatch to the wrong arity with misaligned parameters.
def run_analyzer(parent, id, [url | rest]) do
  Kernel.spawn __MODULE__, :analyze_url, [parent, id, elem(url, 0)]
  run_analyzer parent, id, rest
end

# Nothing left to analyze; mirrors the original's nil result for the
# empty case (the caller ignores the return value).
def run_analyzer(_parent, _id, []) do
  nil
end
# Analyzes a single URL (binary form) and builds a result prefix string.
#
# NOTE(review): this clause looks unfinished — confirm before relying on it:
#   * the 4-tuple destructure below does not match analyze_url/2, which
#     returns a keyword list of five entries (or nil on redirect overflow);
#   * rebinding id_str/size_str inside `if` does not leak out of the branch
#     in Elixir proper (pre-1.0 scoping differed — verify target version),
#     so `result` may only ever see the "" defaults;
#   * `result` is computed but never sent to `parent`, and `size_str`,
#     `type` and `title` are unused.
def analyze_url(parent, id, url) when is_binary url do
  id_str = ""
  size_str = ""
  {status, type, size, title} = binary_to_list url |> analyze_url
  # Prefix the buffer id, e.g. "(42) ", when one was supplied.
  if id != nil do
    id_str = "(" <> id <> ") "
  end
  # Append a human-readable size, e.g. "(1.50 MB)", when known.
  if is_number size and size > 0 do
    size_str = "(" <> human_bytes(size) <> ")"
  end
  result = id_str
end
# Public entry point for charlist URLs: start with zero redirects seen.
def analyze_url(url) when is_list(url), do: analyze_url(url, 0)

# Redirect budget exhausted — give up and report nothing.
def analyze_url(_url, redirects) when redirects > @max_redirects, do: nil
# Analyzes a single URL given as an Erlang charlist: issues a HEAD request,
# follows 301/302/303/307 redirects (bounded by @max_redirects via the
# guard clause above), then GETs the page and returns a keyword list with
# :status, :domain, :type, :size and :title. Returns nil when the redirect
# chain is too long (see the guard clause).
def analyze_url(url, redirects) when is_list url do
  {:ok, {{_, status, _}, headers, _head_body}} = http_head url
  if status == 301 or status == 302 or status == 303 or status == 307 do
    # Follow the redirect target, counting this hop.
    analyze_url(headers['location'], redirects + 1)
  else
    # NOTE(review): the content-type gate was commented out in the
    # original, so every non-redirect response is fetched in full.
    #if (status == 200 and content_type == "text/html") do
    {:ok, {{_, status, _}, headers, body}} = http_get url
    title = parse_title body
    content_type = parse_content_type headers['content-type']
    #end
    # BUG FIX: the original rebound content_length inside `if`, which does
    # not leak out of the branch in Elixir; bind the expression result
    # instead. A missing header stays "" exactly as before.
    raw_length = to_binary headers['content-length']
    content_length = if raw_length != "" do
      binary_to_integer raw_length
    else
      raw_length
    end
    domain = URI.parse(to_binary url).authority
    [
      status: status,
      domain: domain,
      type: content_type,
      size: content_length,
      title: Nulform.Utilities.to_utf8(to_binary title)
    ]
  end
end
# Extracts the contents of the first <title> element from an HTML binary,
# or nil when no title is present.
def parse_title(html) do
  regex = %R@<title.*?>([^>]*?)<\s*?/\s*?title\s*?>@i
  matches = Regex.scan regex, html
  if not Enum.empty? matches do
    # First element of the first match; relies on the Regex.scan result
    # shape of the Elixir version in use — verify on upgrade.
    Enum.at Enum.at(matches, 0), 0
  else
    nil
  end
end
# Shorthand for a HEAD request.
def http_head(url), do: http_req(:head, url)

# Shorthand for a GET request.
def http_get(url), do: http_req(:get, url)

# Issues a synchronous :httpc request with the module-wide user agent,
# timeout/redirect settings and binary body format.
def http_req(mode, url) do
  :httpc.request(mode, {url, @headers}, @http_options, @base_options)
end
# Formats a byte count as a human-readable string, e.g. "1.50 MB".
# Defaults: factor 1000 between units, two decimal places.
def human_bytes(size) do
  human_bytes size, 1000
end

def human_bytes(size, factor) do
  human_bytes size, factor, 2
end

def human_bytes(size, factor, decimals) do
  # BUG FIX: "TP" corrected to "TB" (terabytes).
  human_bytes size, factor, decimals, ["B", "kB", "MB", "GB", "TB", "PB"]
end

# Step to the next-larger unit while the value still exceeds the factor.
# The rest != [] guard stops at the largest available unit instead of
# exhausting the list and crashing on huge inputs.
def human_bytes(size, factor, decimals, [_ext | rest]) when size >= factor and rest != [] do
  human_bytes size / factor, factor, decimals, rest
end

def human_bytes(size, factor, decimals, [ext | _rest]) do
  # BUG FIX: float_to_binary/2 raises for integer input (any size below the
  # first factor arrives as an integer); size / 1 coerces to float first.
  :erlang.float_to_binary(size / 1, decimals: decimals) <> " " <> ext
end
# Returns just the MIME type from a content-type header value (an Erlang
# charlist or nil), dropping any ";charset=..." parameter.
defp parse_content_type(header) do
  header
  |> to_binary()
  |> String.split(";")
  |> Enum.at(0)
end
end end

32
lib/nulform/utilities.ex Normal file
View file

@ -0,0 +1,32 @@
defmodule Nulform.Utilities do
  @moduledoc """
  Grab-bag utilities for the bot that have no better home elsewhere.
  """

  @doc """
  Best-effort conversion of a binary to UTF-8 while preserving its content.

  The true encoding of the input cannot be known here, so the binary is
  returned as-is when it is already valid UTF-8 and otherwise re-encoded
  as latin-1. Telling latin-1 apart from the other 8-bit encodings would
  require smelly heuristics, so latin-1 is the only fallback supported.

  Don't you just love character encodings?
  """
  def to_utf8(string) do
    case String.valid?(string) do
      true -> string
      false -> latin1_to_utf8(string)
    end
  end

  @doc """
  Convert a latin-1 binary to UTF-8. Gives wrong results if the input is
  not actually latin-1.
  """
  def latin1_to_utf8(string) do
    :unicode.characters_to_binary(string, :latin1, :utf8)
  end
end