# Source listing metadata (from the code viewer): 265 lines, 9.3 KiB, Elixir.
defmodule Nulform.Plugins.URLAnalyzer do
|
|
use GenServer.Behaviour
|
|
@moduledoc """
|
|
This is an example plugin which analyzes URLs on IRC. It scans incoming
|
|
IRC messages for URLs and analyzes them, returning data about them.
|
|
|
|
The algorithm for analyzing URLs is as follows:
|
|
* Send HEAD request to URL. Store content-length, content-type, and
|
|
HTTP code.
|
|
* If an error happened, return the collected data.
|
|
* If request succeeded and content-type is text/html, issue a GET
|
|
request to the URL.
|
|
* Try to find <title> contents from body and return it with other data
|
|
if found.
|
|
"""
|
|
@timeout 10000
|
|
@base_options [body_format: :binary, sync: true]
|
|
@http_options [timeout: @timeout, autoredirect: false]
|
|
# We need a real user agent since some sites fail on nonstandard ones
|
|
@headers [{'user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36'}]
|
|
@max_redirects 10
|
|
@parseable_types [
|
|
"text/html", "text/xml", "application/xml", "application/xhtml+xml"
|
|
]
|
|
@domain_blacklist [
|
|
%R/(192|127|10)\.\d{1,3}\.\d{1,3}\.\d{1,3}/, %R/localhost/i
|
|
]
|
|
|
|
# GenServer init: bring up Erlang's HTTP client stack (inets + TLS).
# Return values of the starts are deliberately ignored; if they failed,
# the first request will crash and surface the problem.
def init(nil) do
  Enum.each [:inets, :ssl], fn app -> app.start() end
  {:ok, nil}
end
|
|
|
|
# Incoming IRC traffic: parse PRIVMSGs, scan the text for URLs and kick
# off one analyzer per URL. The server itself is stateless (state is nil).
def handle_cast(msg, nil) do
  IO.inspect msg

  case Nulform.IRC.PRIVMSG.parse(msg) do
    nil ->
      nil

    in_msg ->
      IO.puts "Message parsed!"
      found = parse_urls in_msg.text
      IO.inspect found
      # Number the replies ("(0) ", "(1) ", ...) only when more than one
      # URL was found; a single URL gets an unnumbered reply (id nil).
      first_id = if Enum.count(found) > 1, do: 0
      run_analyzer in_msg, first_id, found
  end

  {:noreply, nil}
end
|
|
|
|
# Scans `msg` for things that look like URLs (http://, https:// or a bare
# www. prefix) and returns Regex.scan's result. Each element is the match
# list for one URL; Enum.at(match, 0) yields the full matched URL (see
# run_analyzer/3).
def parse_urls(msg) do
  # Extended (x) and case-insensitive (i) pattern: whitespace and the
  # inline # comments inside the sigil are ignored by the regex engine.
  regex = %R"""ix
  (
    (?:
      (?:https?://)|(?:www\.) # Catch http://, https:// and www.
    )
    (?:
      (?:\w+\-)*\w+\.
    )+ # Hostname parts, \w separated by - and . at the end
    (?:
      (?:
        (?:[a-z]+(?:\w+\-)*\w+) # Top-level domain, starts with a-z
        \.? # Optional root domain dot
      )
      |
      (?:\d{1,3}) # Or an IP address final term
    )
    (?::\d+)? # Optional port number
    (?:
      (?:/[^?\s]*)+ # URL path, anything non-?, non-ws separated by /
      (?:\?\S*)? # Optional query string, anything after ?, up to #
    )? # Make the whole path & query part optional
  )
  """

  Regex.scan regex, msg
end
|
|
|
|
# Spawns one analyzer process per scanned URL so a slow host never blocks
# the others. `id` is nil for a single URL, otherwise a running index that
# tags each reply. Returns nil once the list is exhausted.
def run_analyzer(_msg, _id, []), do: nil

def run_analyzer(msg, id, [url | rest]) do
  # `url` is one Regex.scan match list; element 0 is the full matched URL.
  Kernel.spawn __MODULE__, :analyze_url, [msg, id, Enum.at(url, 0)]
  next_id = if id != nil, do: id + 1
  run_analyzer msg, next_id, rest
end
|
|
|
|
# Worker entry point (spawned by run_analyzer/3): analyzes `url` and casts
# a formatted, human-readable result line back to the IRC buffer the
# original message came from.
#
# `id` is nil for a single-URL message, or a 0-based index used to prefix
# the reply ("(N) ...") when several URLs were found.
#
# NOTE(review): this function relies on pre-1.0 Elixir scoping in which
# variables assigned inside if/case blocks (id_str, size_str, result) leak
# into the enclosing scope. It will not behave this way on Elixir >= 1.0.
def analyze_url(msg, id, url) when is_binary url do
  id_str = ""
  size_str = ""

  # Prefix replies with "(N) " when multiple URLs are being analyzed.
  if id != nil do
    id_str = "(" <> to_binary(id) <> ") "
  end

  analysis = analyze_url url

  case analysis do
    # Success: {HTTP status, host, content type, byte size, page title}.
    {status, domain, type, size, title} ->
      result = id_str <> "[" <> domain <> "] "
      if status == 200 do
        # Only show a size when the server reported a positive length.
        if is_number(size) and size > 0 do
          size_str = "(" <> human_bytes(size) <> ")"
        end

        result = result <> type <> " " <> size_str

        IO.inspect result
        IO.inspect title
        # Append the page title when one was extracted from the body.
        if title != "" do
          result = result <> " | " <> title
          IO.inspect result
        end
      else
        # Non-200 (and not a followed redirect): just report the code.
        result = result <> "HTTP " <> to_binary status
      end

    # Failure: map the machine-readable reason to a short message.
    {:error, error, domain} ->
      result = id_str <> "[" <> domain <> "] " <>
      case error do
        :timeout -> "Timed out."
        :no_scheme -> "No scheme."
        :max_redirects -> "Too many redirects."
        :blacklisted -> "Host blacklisted."
        :unknown_method -> "Unknown HTTP method."
        {:failed_connect, _} -> "Connection failed."
      end
  end

  # Deliver the formatted line back to the originating buffer process.
  result_msg = Nulform.IRC.PRIVMSG.reply msg, result
  :gen_server.cast msg.info.buffer, result_msg
end
|
|
|
|
# Public entry point: start the analysis with zero redirects followed.
def analyze_url(url), do: analyze_url(url, 0)
|
|
|
|
# Always begin with a HEAD request; a GET is only issued later when the
# content type turns out to be parseable (or HEAD was rejected).
def analyze_url(url, redirects), do: analyze_url(url, redirects, :head)
|
|
|
|
# Redirect limit exceeded: abort and report the host so the caller can
# include it in the error line. (`mode` is irrelevant here; underscored
# to silence the unused-variable warning.)
def analyze_url(url, redirects, _mode) when redirects > @max_redirects do
  {:error, :max_redirects, URI.parse(url).authority}
end
|
|
|
|
# Core analyzer: HEAD the URL, follow redirect statuses manually (up to
# @max_redirects), and re-issue as GET to extract the <title> when the
# content type looks like parseable HTML/XML.
#
# Returns {status, domain, content_type, content_length, title} on
# success or {:error, reason, domain} on failure.
#
# NOTE(review): relies on pre-1.0 Elixir scoping where variables rebound
# inside if blocks (url, new_url, content_length, title) leak out; this
# will not behave the same on Elixir >= 1.0.
def analyze_url(url, redirects, mode) do
  IO.puts "Analyzing " <> url <> " round " <> to_binary redirects
  title = ""

  # Schemeless "www." URLs get a default http:// scheme prepended.
  if String.starts_with? url, "www." do
    url = "http://" <> url
  end

  # Strip anchor
  url = Enum.at String.split(url, "#"), 0

  domain = URI.parse(url).authority
  case match_blacklist URI.parse(url).host do
    true -> {:error, :blacklisted, domain}
    false ->
      # :httpc expects a charlist URL, hence binary_to_list.
      result =
        case mode do
          :head -> http_head binary_to_list(url)
          :get -> http_get binary_to_list(url)
          _ -> {:error, :unknown_method, domain} # fail
        end

      case result do
        {:ok, {{_, status, _}, headers, body}} ->
          content_type = parse_content_type headers['content-type']

          # Redirects are followed by hand since autoredirect is disabled
          # in @http_options (we want to count and cap them).
          if status == 301 or status == 302 or status == 303 or status == 307 do
            new_url = to_binary headers['location']
            # Some servers send relative Location headers; absolutize them
            # against the current scheme and host.
            if not String.starts_with? String.downcase(new_url), ["http://", "https://"] do
              IO.puts "Fixing " <> new_url <> " to..."
              # NOTE(review): String.lstrip/2 of this era took a codepoint
              # (?/) as its second argument — confirm passing the binary
              # "/" actually strips the leading slash here.
              new_url = URI.parse(url).scheme <> "://" <> domain <> "/" <> String.lstrip new_url, "/"
              IO.inspect new_url
            end
            analyze_url new_url, redirects + 1, mode
          else
            # Upgrade HEAD to GET when the server errored on the HEAD or
            # the content type is parseable (so the title can be fetched).
            if mode != :get and (status != 200 or Enum.any? @parseable_types, fn(x) -> x == content_type end) do
              analyze_url url, redirects + 1, :get
            else
              IO.inspect mode
              IO.inspect status
              IO.inspect content_type
              # Only attempt title extraction on a successful GET of a
              # parseable content type.
              if mode == :get and status == 200 and Enum.any? @parseable_types, fn(x) -> x == content_type end do
                title = parse_title body
                IO.inspect title
              end
              IO.inspect Nulform.Utilities.to_utf8(title)

              # Absent content-length header yields "" (stays a binary);
              # otherwise convert to an integer byte count.
              content_length = to_binary headers['content-length']
              if content_length != "" do
                content_length = binary_to_integer content_length
              end

              {
                status,
                domain,
                content_type,
                content_length,
                Nulform.Utilities.to_utf8 title
              }
            end
          end

        {:error, error} -> {:error, error, domain}
      end
  end
end
|
|
|
|
# True when `domain` matches any @domain_blacklist pattern (private IP
# ranges and localhost), so we never probe internal hosts.
def match_blacklist(domain) do
  Enum.any? @domain_blacklist, fn pattern -> Regex.match? pattern, domain end
end
|
|
|
|
# Extracts the text of the first <title> element in `html`, collapsing
# whitespace runs to single spaces. Returns "" when no title is found.
def parse_title(html) do
  pattern = %R@<title.*?>([^>]*?)<\s*?/\s*?title\s*?>@i

  case Regex.scan pattern, html do
    [] ->
      ""

    [first_match | _] ->
      Regex.replace %R/\s+/, Enum.at(first_match, 0), " "
  end
end
|
|
|
|
# HEAD request for `url` (a charlist, as :httpc requires).
def http_head(url), do: http_req(:head, url)
|
|
|
|
# GET request for `url` (a charlist, as :httpc requires).
def http_get(url), do: http_req(:get, url)
|
|
|
|
# Synchronous :httpc call using a real browser user-agent (some sites
# reject unknown UAs) and with automatic redirects disabled so the caller
# can follow and count them manually.
def http_req(mode, url) do
  request = {url, @headers}
  :httpc.request mode, request, @http_options, @base_options
end
|
|
|
|
# Default formatting: SI units (factor 1000).
def human_bytes(size), do: human_bytes(size, 1000)
|
|
|
|
# Default formatting: two decimal places.
def human_bytes(size, factor), do: human_bytes(size, factor, 2)
|
|
|
|
# Renders a byte count as a human-readable string, e.g.
# human_bytes(1500, 1000, 2) == "1.50 kB".
#
# NOTE(review): fixed the unit-label typo "TP" -> "TB".
def human_bytes(size, factor, decimals) do
  human_bytes size, factor, decimals, ["B", "kB", "MB", "GB", "TB", "PB"]
end

# Keep dividing by `factor` until the value fits the current unit.
# (Unused head bindings are underscored to avoid compiler warnings.)
def human_bytes(size, factor, decimals, [_ext | rest]) when size >= factor do
  human_bytes size / factor, factor, decimals, rest
end

# Value fits this unit: format with fixed decimals and append the label.
def human_bytes(size, _factor, decimals, [ext | _rest]) do
  float_to_binary(:erlang.float(size), [decimals: decimals]) <> " " <> ext
end
|
|
|
|
# Reduces an :httpc content-type header value (a charlist like
# 'text/html; charset=utf-8') to just the media type ("text/html").
defp parse_content_type(header) do
  header |> to_binary |> String.split(";") |> Enum.at(0)
end
|
|
end
|