nulform/lib/plugins/urlanalyzer.ex

266 lines
9.3 KiB
Elixir
Raw Normal View History

2013-06-27 20:56:27 +00:00
defmodule Nulform.Plugins.URLAnalyzer do
use GenServer.Behaviour
@moduledoc """
This is an example plugin which analyzes URLs on IRC. It scans incoming
IRC messages for URLs and analyzes them, returning data about them.

The algorithm for analyzing URLs is as follows:

* Send a HEAD request to the URL. Store the content-length, content-type
  and HTTP status code.
* If an error happened, return the collected data.
* If the HEAD request did not return 200, or the content-type is one of the
  parseable HTML/XML types, issue a GET request to the URL.
* Try to find the <title> contents in the body and return it with the
  other data if found.
"""
# Request timeout in milliseconds, passed to :httpc as an HTTP option.
@timeout 10000
# :httpc request options: return bodies as binaries, perform requests synchronously.
@base_options [body_format: :binary, sync: true]
# Redirects are followed manually in analyze_url/3 so that every hop is
# checked against the blacklist and counted against @max_redirects.
@http_options [timeout: @timeout, autoredirect: false]
# We need a real user agent since some sites fail on nonstandard ones
@headers [{'user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36'}]
# Maximum number of follow-up requests (redirects and the HEAD -> GET retry).
@max_redirects 10
# Content types whose body is searched for a <title> element.
@parseable_types [
  "text/html", "text/xml", "application/xml", "application/xhtml+xml"
]
# Hosts that must never be requested. The patterns are anchored because they
# are matched against the bare host name: unanchored, "210.10.1.1" would be
# rejected for containing "10.10.1.1". Covers "this network", loopback, the
# RFC 1918 private ranges, link-local addresses and localhost.
@domain_blacklist [
  %R/^(?:0|10|127)\.\d{1,3}\.\d{1,3}\.\d{1,3}$/,
  %R/^192\.168\.\d{1,3}\.\d{1,3}$/,
  %R/^172\.(?:1[6-9]|2\d|3[01])\.\d{1,3}\.\d{1,3}$/,
  %R/^169\.254\.\d{1,3}\.\d{1,3}$/,
  %R/^localhost\.?$/i
]
# GenServer callback. Starts the inets and ssl applications (the HTTP client
# used by http_req/2, :httpc, is part of inets) and keeps no state.
def init(nil) do
  # Start results are deliberately ignored: if either application failed to
  # start, the first HTTP request will crash instead.
  _inets = :inets.start()
  _ssl = :ssl.start()

  state = nil
  {:ok, state}
end
# GenServer callback. Tries to parse `msg` as a PRIVMSG; when that succeeds,
# extracts the URLs from its text and starts one analyzer per URL.
# Messages that do not parse are ignored.
def handle_cast(msg, nil) do
  IO.inspect msg

  case Nulform.IRC.PRIVMSG.parse(msg) do
    nil ->
      nil

    parsed ->
      IO.puts "Message parsed!"
      found = parse_urls(parsed.text)
      IO.inspect found
      # Replies are only numbered when a message carries more than one URL.
      first_id = if Enum.count(found) > 1, do: 0, else: nil
      run_analyzer(parsed, first_id, found)
  end

  {:noreply, nil}
end
@doc """
Scans `msg` for things that look like URLs: anything starting with
`http://`, `https://` or `www.`.

Returns the result of `Regex.scan/2`, a list with one entry per match. The
whole URL is wrapped in a capture group, so the first element of each entry
is the matched URL.
"""
def parse_urls(msg) do
  # Extended mode (x) lets the pattern carry whitespace and comments; the
  # modifiers belong after the closing delimiter of the heredoc.
  regex = %R"""
  (
    (?:
      (?:https?://)|(?:www\.)   # Catch http://, https:// and www.
    )
    (?:
      (?:\w+\-)*\w+\.
    )+                          # Hostname parts, \w separated by - and . at the end
    (?:
      (?:
        (?:[a-z]+(?:\w+\-)*\w+) # Top-level domain, starts with a-z
        \.?                     # Optional root domain dot
      )
      |
      (?:\d{1,3})               # Or an IP address final term
    )
    (?::\d+)?                   # Optional port number
    (?:
      (?:/[^?\s]*)+             # URL path, anything non-?, non-ws separated by /
      (?:\?\S*)?                # Optional query string: ? plus any non-whitespace
    )?                          # Make the whole path & query part optional
  )
  """ix

  Regex.scan regex, msg
end
# Spawns one analyze_url/3 process per entry in the list of scan matches.
# `id` is either nil (replies are not numbered) or an integer that is
# incremented for every URL. Returns nil once the list is exhausted.
def run_analyzer(_msg, _id, []), do: nil

def run_analyzer(msg, id, [match | remaining]) do
  # The first element of each scan match is the URL itself.
  Kernel.spawn __MODULE__, :analyze_url, [msg, id, Enum.at(match, 0)]

  next_id = if id != nil, do: id + 1, else: nil
  run_analyzer msg, next_id, remaining
end
# Analyzes `url` and casts a one-line summary, built as a reply to `msg`, to
# the buffer the message came from. `id` is nil or an integer used to prefix
# the reply. This clause is told apart from the (url, redirects, mode)
# clauses of analyze_url/3 by its third argument being a binary.
def analyze_url(msg, id, url) when is_binary url do
  id_str = if id != nil, do: "(" <> to_binary(id) <> ") ", else: ""

  result =
    case analyze_url(url) do
      {status, domain, type, size, title} ->
        prefix = id_str <> "[" <> domain <> "] "

        if status == 200 do
          # The size is only shown when the server reported a usable length.
          size_str =
            if is_number(size) and size > 0 do
              "(" <> human_bytes(size) <> ")"
            else
              ""
            end

          summary = prefix <> type <> " " <> size_str
          IO.inspect summary
          IO.inspect title

          if title != "" do
            with_title = summary <> " | " <> title
            IO.inspect with_title
            with_title
          else
            summary
          end
        else
          prefix <> "HTTP " <> to_binary(status)
        end

      {:error, error, domain} ->
        reason =
          case error do
            :timeout -> "Timed out."
            :no_scheme -> "No scheme."
            :max_redirects -> "Too many redirects."
            :blacklisted -> "Host blacklisted."
            :unknown_method -> "Unknown HTTP method."
            {:failed_connect, _} -> "Connection failed."
            # Any other reason reported by the HTTP client would otherwise
            # crash this process without a reply.
            _ -> "Unknown error."
          end

        id_str <> "[" <> domain <> "] " <> reason
    end

  result_msg = Nulform.IRC.PRIVMSG.reply msg, result
  :gen_server.cast msg.info.buffer, result_msg
end
# Analyzes `url` starting with a redirect count of zero.
def analyze_url(url) do
  analyze_url url, 0
end

# Analyzes `url` with the given redirect count, starting with a HEAD request.
def analyze_url(url, redirects) do
  analyze_url url, redirects, :head
end
# Gives up once more than @max_redirects follow-up requests have been made.
def analyze_url(url, redirects, _mode) when redirects > @max_redirects do
  {:error, :max_redirects, URI.parse(url).authority}
end

# Requests `url` with the HTTP method `mode` (:head or :get).
#
# Returns {status, domain, content_type, content_length, title} for a
# completed request, or {:error, reason, domain} otherwise. Redirects and the
# HEAD -> GET retry recurse with `redirects + 1`, so every hop is checked
# against the blacklist and counted against the redirect limit.
def analyze_url(url, redirects, mode) do
  IO.puts "Analyzing " <> url <> " round " <> to_binary redirects

  # Scheme-less "www." matches are requested over plain HTTP.
  url = if String.starts_with?(url, "www."), do: "http://" <> url, else: url
  # Strip anchor
  url = Enum.at String.split(url, "#"), 0

  parsed = URI.parse(url)
  domain = parsed.authority

  case match_blacklist(parsed.host) do
    true ->
      {:error, :blacklisted, domain}

    false ->
      result =
        case mode do
          :head -> http_head binary_to_list(url)
          :get -> http_get binary_to_list(url)
          # Same two-element shape as :httpc errors, so the clause below
          # attaches the domain instead of failing to match.
          _ -> {:error, :unknown_method}
        end

      case result do
        {:ok, {{_, status, _}, headers, body}} ->
          content_type = parse_content_type headers['content-type']
          parseable = Enum.any? @parseable_types, fn(x) -> x == content_type end

          cond do
            status == 301 or status == 302 or status == 303 or status == 307 ->
              location = to_binary headers['location']

              # Resolve non-absolute Location values against the current
              # scheme and host.
              new_url =
                if String.starts_with? String.downcase(location), ["http://", "https://"] do
                  location
                else
                  IO.puts "Fixing " <> location <> " to..."
                  fixed = parsed.scheme <> "://" <> domain <> "/" <> String.lstrip(location, "/")
                  IO.inspect fixed
                  fixed
                end

              analyze_url new_url, redirects + 1, mode

            # A HEAD that failed, or that points at a parseable document, is
            # retried as a GET so the body can be inspected.
            mode != :get and (status != 200 or parseable) ->
              analyze_url url, redirects + 1, :get

            true ->
              IO.inspect mode
              IO.inspect status
              IO.inspect content_type

              title =
                if mode == :get and status == 200 and parseable do
                  found = parse_title body
                  IO.inspect found
                  found
                else
                  ""
                end

              title = Nulform.Utilities.to_utf8 title
              IO.inspect title

              # Stays "" when the server sent no content-length header.
              raw_length = to_binary headers['content-length']

              content_length =
                if raw_length != "" do
                  binary_to_integer raw_length
                else
                  raw_length
                end

              {status, domain, content_type, content_length, title}
          end

        {:error, error} ->
          {:error, error, domain}
      end
  end
end
@doc """
Returns `true` when `domain` matches at least one of the patterns in
`@domain_blacklist`, `false` otherwise.
"""
def match_blacklist(domain) do
  matches_host = fn(pattern) -> Regex.match?(pattern, domain) end
  Enum.any?(@domain_blacklist, matches_host)
end
@doc """
Extracts the contents of the first `<title>` element found in `html`.

Runs of whitespace inside the title are collapsed into single spaces.
Returns `""` when no title is found.
"""
def parse_title(html) do
  regex = %R@<title.*?>([^>]*?)<\s*?/\s*?title\s*?>@i

  case Regex.scan(regex, html) do
    [] ->
      ""

    [first_match | _] ->
      # The capture group is the last element of a match whether or not
      # Regex.scan also includes the full match, so the surrounding tags
      # never end up in the result.
      title = List.last(first_match)
      Regex.replace %R/\s+/, title, " "
  end
end
# Issues a HEAD request for `url` (callers in this module pass a char list).
def http_head(url), do: http_req(:head, url)
# Issues a GET request for `url` (callers in this module pass a char list).
def http_get(url), do: http_req(:get, url)
# Performs an HTTP request with method `mode` through :httpc, sending the
# module's fixed request headers and options. Returns whatever
# :httpc.request/4 returns.
def http_req(mode, url) do
  request = {url, @headers}
  :httpc.request(mode, request, @http_options, @base_options)
end
@doc """
Formats a byte count as a human-readable string such as `"1.50 kB"`.

`factor` is the step between units (default 1000), `decimals` the number of
decimals shown (default 2), and the optional fourth argument the list of
unit names in ascending order. Sizes too large for the last unit are
expressed in that unit.
"""
def human_bytes(size) do
  human_bytes size, 1000
end

def human_bytes(size, factor) do
  human_bytes size, factor, 2
end

def human_bytes(size, factor, decimals) do
  human_bytes size, factor, decimals, ["B", "kB", "MB", "GB", "TB", "PB"]
end

# Move up one unit while the size is still at least one full step and a
# larger unit is available.
def human_bytes(size, factor, decimals, [_ext | rest]) when size >= factor and rest != [] do
  human_bytes size / factor, factor, decimals, rest
end

# The size fits the current unit, or no larger unit is left.
def human_bytes(size, _factor, decimals, [ext | _rest]) do
  :erlang.float_to_binary(:erlang.float(size), [decimals: decimals]) <> " " <> ext
end
# Returns the media type of a content-type header value: everything before
# the first ";", i.e. without parameters such as charset. The result is
# lower-cased so that the comparison against @parseable_types is
# case-insensitive.
defp parse_content_type(header) do
  media_type = Enum.at String.split(to_binary(header), ";"), 0
  String.downcase media_type
end
end