Add IP support and a blacklist for domains/IPs

This commit is contained in:
Mikko Ahlroth 2013-08-03 23:25:43 +03:00
parent 2aa388e901
commit 2d8dcdae34

View file

@ -22,6 +22,9 @@ defmodule Nulform.Plugins.URLAnalyzer do
@parseable_types [
"text/html", "text/xml", "application/xml", "application/xhtml+xml"
]
# Regexes checked against a URL's host (see match_blacklist/1) to refuse
# analyzing internal/loopback targets: private-looking IPv4 addresses and
# "localhost" (case-insensitive).
# NOTE(review): both patterns are unanchored, so they match substrings too
# (e.g. "notlocalhost.example" or an IP embedded in a longer host) — confirm
# that is intended.
# NOTE(review): "192." blocks all of 192.0.0.0/8, far more than the private
# 192.168.0.0/16 range, while the private 172.16.0.0/12 range is not covered
# at all — verify against RFC 1918 if strict private-range filtering is the goal.
@domain_blacklist [
  %R/(192|127|10)\.\d{1,3}\.\d{1,3}\.\d{1,3}/, %R/localhost/i
]
def init(nil) do
# If these fail, let them fail, we'll crash later
@ -54,11 +57,15 @@ defmodule Nulform.Plugins.URLAnalyzer do
(?:
(?:\w+\-)*\w+\.
)+ # Hostname parts, \w separated by - and . at the end
(?:
(?:[a-z]+(?:\w+\-)*\w+) # Top-level domain, starts with a-z
\.? # Optional root domain dot
(?::\d+)? # Optional port number
(
(?:
(?:[a-z]+(?:\w+\-)*\w+) # Top-level domain, starts with a-z
\.? # Optional root domain dot
)
|
(?:\d{1,3}) # Or an IP address final term
)
(?::\d+)? # Optional port number
(?:
(?:/[^?\s]*)+ # URL path, anything non-?, non-ws separated by /
(?:\?(?:\S*))? # Optional query string, anything after ?
@ -86,7 +93,7 @@ defmodule Nulform.Plugins.URLAnalyzer do
id_str = "(" <> to_binary(id) <> ") "
end
analysis = binary_to_list(url) |> analyze_url
analysis = analyze_url url
case analysis do
{status, domain, type, size, title} ->
@ -109,13 +116,15 @@ defmodule Nulform.Plugins.URLAnalyzer do
end
{:error, error, domain} ->
result = id_str <> "[" <> domain <> "] "
case error do
:timeout -> result = result <> "Timed out."
:no_scheme -> result = result <> "No scheme."
:max_redirects -> result = result <> "Too many redirects."
{:failed_connect, _} -> result = result <> "Connection failed."
end
result = id_str <> "[" <> domain <> "] " <>
case error do
:timeout -> "Timed out."
:no_scheme -> "No scheme."
:max_redirects -> "Too many redirects."
:blacklisted -> "Host blacklisted."
:unknown_method -> "Unknown HTTP method."
{:failed_connect, _} -> "Connection failed."
end
end
result_msg = Nulform.IRC.PRIVMSG.reply msg, result
@ -131,66 +140,74 @@ defmodule Nulform.Plugins.URLAnalyzer do
end
def analyze_url(url, redirects, mode) when redirects > @max_redirects do
{:error, :max_redirects, URI.parse(to_binary url).authority}
{:error, :max_redirects, URI.parse(url).authority}
end
def analyze_url(url, redirects, mode) do
IO.puts "Analyzing " <> to_binary(url) <> " round " <> to_binary redirects
IO.puts "Analyzing " <> url <> " round " <> to_binary redirects
title = ""
domain = URI.parse(to_binary url).authority
domain = URI.parse(url).authority
result =
case mode do
:head -> http_head url
:get -> http_get url
_ -> {:error, :unknown_method} # fail
end
case result do
{:ok, {{_, status, _}, headers, body}} ->
content_type = parse_content_type headers['content-type']
if status == 301 or status == 302 or status == 303 or status == 307 do
new_url = to_binary headers['location']
# Fix non-absolute location URIs by retard webdevs
if not String.starts_with? String.downcase(new_url), ["http://", "https://"] do
IO.puts "Fixing " <> new_url <> " to..."
new_url = URI.parse(to_binary url).scheme <> "://" <> domain <> "/" <> String.lstrip new_url, "/"
IO.inspect new_url
case match_blacklist URI.parse(url).host do
true -> {:error, :blacklisted, domain}
false ->
result =
case mode do
:head -> http_head binary_to_list(url)
:get -> http_get binary_to_list(url)
_ -> {:error, :unknown_method, domain} # fail
end
analyze_url binary_to_list(new_url), redirects + 1, mode
else
if mode != :get and (status != 200 or Enum.any? @parseable_types, fn(x) -> x == content_type end) do
analyze_url url, redirects + 1, :get
else
IO.inspect mode
IO.inspect status
IO.inspect content_type
if mode == :get and status == 200 and Enum.any? @parseable_types, fn(x) -> x == content_type end do
title = parse_title body
IO.inspect title
end
IO.inspect Nulform.Utilities.to_utf8(title)
content_length = to_binary headers['content-length']
if content_length != "" do
content_length = binary_to_integer content_length
case result do
{:ok, {{_, status, _}, headers, body}} ->
content_type = parse_content_type headers['content-type']
if status == 301 or status == 302 or status == 303 or status == 307 do
new_url = to_binary headers['location']
# Fix non-absolute location URIs by retard webdevs
if not String.starts_with? String.downcase(new_url), ["http://", "https://"] do
IO.puts "Fixing " <> new_url <> " to..."
new_url = URI.parse(url).scheme <> "://" <> domain <> "/" <> String.lstrip new_url, "/"
IO.inspect new_url
end
analyze_url new_url, redirects + 1, mode
else
if mode != :get and (status != 200 or Enum.any? @parseable_types, fn(x) -> x == content_type end) do
analyze_url url, redirects + 1, :get
else
IO.inspect mode
IO.inspect status
IO.inspect content_type
if mode == :get and status == 200 and Enum.any? @parseable_types, fn(x) -> x == content_type end do
title = parse_title body
IO.inspect title
end
IO.inspect Nulform.Utilities.to_utf8(title)
content_length = to_binary headers['content-length']
if content_length != "" do
content_length = binary_to_integer content_length
end
{
status,
domain,
content_type,
content_length,
Nulform.Utilities.to_utf8 title
}
end
end
{
status,
domain,
content_type,
content_length,
Nulform.Utilities.to_utf8 title
}
end
{:error, error} -> {:error, error, domain}
end
{:error, error} -> {:error, error, domain}
end
end
# Returns true when `domain` matches any regex in @domain_blacklist.
def match_blacklist(domain) do
  @domain_blacklist
  |> Enum.any?(&Regex.match?(&1, domain))
end
def parse_title(html) do
regex = %R@<title.*?>([^>]*?)<\s*?/\s*?title\s*?>@i
title = Regex.scan regex, html