URLAnalyzer now works most of the time, but a few hanging bugs still need to be fixed

This commit is contained in:
Mikko Ahlroth 2013-08-03 00:40:05 +03:00
parent 455204cf7c
commit f70404a3c4

View file

@ -19,20 +19,30 @@ defmodule Nulform.Plugins.URLAnalyzer do
# We need a real user agent since some sites fail on nonstandard ones # We need a real user agent since some sites fail on nonstandard ones
@headers [{'user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36'}] @headers [{'user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36'}]
@max_redirects 10 @max_redirects 10
@parseable_types [
"text/html", "text/xml", "application/xml", "application/xhtml+xml"
]
def init() do def init(nil) do
:ok = :inets.start() # If these fail, let them fail, we'll crash later
{:ok} :inets.start()
:ssl.start()
{:ok, nil}
end end
def handle_cast(msg) do def handle_cast(msg, nil) do
case String.split msg.raw_msg do IO.inspect msg
[_, "PRIVMSG" | _] -> in_msg = Nulform.IRC.PRIVMSG.parse msg
urls = parse_urls msg.raw_msg
run_analyzer msg.buffer, nil, urls if in_msg != nil do
IO.puts "Message parsed!"
urls = parse_urls in_msg.text
IO.inspect urls
id = if(Enum.count(urls) > 1, do: 0)
run_analyzer in_msg, id, urls
end end
{:noreply} {:noreply, nil}
end end
def parse_urls(msg) do def parse_urls(msg) do
@ -59,67 +69,125 @@ defmodule Nulform.Plugins.URLAnalyzer do
Regex.scan regex, msg Regex.scan regex, msg
end end
def run_analyzer(parent, id, urls) when is_list urls do def run_analyzer(msg, id, urls) when is_list urls do
if not Enum.empty? urls do if not Enum.empty? urls do
[url | rest] = urls [url | rest] = urls
Kernel.spawn __MODULE__, :analyze_url, [parent, elem(url, 0)] Kernel.spawn __MODULE__, :analyze_url, [msg, id, Enum.at(url, 0)]
run_analyzer parent, id, rest id = if(id != nil, do: id + 1)
run_analyzer msg, id, rest
end end
end end
def analyze_url(parent, id, url) when is_binary url do def analyze_url(msg, id, url) when is_binary url do
id_str = "" id_str = ""
size_str = "" size_str = ""
{status, type, size, title} = binary_to_list url |> analyze_url
if id != nil do if id != nil do
id_str = "(" <> id <> ") " id_str = "(" <> to_binary(id) <> ") "
end end
if is_number size and size > 0 do analysis = binary_to_list(url) |> analyze_url
size_str = "(" <> human_bytes(size) <> ")"
case analysis do
{status, domain, type, size, title} ->
result = id_str <> "[" <> domain <> "] "
if status == 200 do
if is_number(size) and size > 0 do
size_str = "(" <> human_bytes(size) <> ")"
end
result = result <> type <> " " <> size_str
IO.inspect result
IO.inspect title
if title != "" do
result = result <> " | " <> title
IO.inspect result
end
else
result = result <> "HTTP " <> to_binary status
end
{:error, error, domain} ->
result = id_str <> "[" <> domain <> "] "
case error do
:timeout -> result = result <> "Timed out."
:no_scheme -> result = result <> "No scheme."
:max_redirects -> result = result <> "Too many redirects."
{:failed_connect, _} -> result = result <> "Connection failed."
end
end end
result = id_str result_msg = Nulform.IRC.PRIVMSG.reply msg, result
:gen_server.cast msg.info.buffer, result_msg
end end
def analyze_url(url) when is_list url do def analyze_url(url) do
analyze_url url, 0 analyze_url url, 0
end end
def analyze_url(url, redirects) when redirects > @max_redirects do def analyze_url(url, redirects) do
nil analyze_url url, redirects, :head
end end
def analyze_url(url, redirects) when is_list url do def analyze_url(url, redirects, mode) when redirects > @max_redirects do
title = nil {:error, :max_redirects, URI.parse(to_binary url).authority}
{:ok, {{_, status, _}, headers, body}} = http_head url end
content_type = parse_content_type headers['content-type'] def analyze_url(url, redirects, mode) do
IO.puts "Analyzing " <> to_binary(url) <> " round " <> to_binary redirects
title = ""
domain = URI.parse(to_binary url).authority
if status == 301 or status == 302 or status == 303 or status == 307 do result =
analyze_url(headers['location'], redirects + 1) case mode do
else :head -> http_head url
#if (status == 200 and content_type == "text/html") do :get -> http_get url
{:ok, {{_, status, _}, headers, body}} = http_get url _ -> {:error, :unknown_method} # fail
title = parse_title body
content_type = parse_content_type headers['content-type']
#end
content_length = to_binary headers['content-length']
if content_length != "" do
content_length = binary_to_integer content_length
end end
domain = URI.parse(to_binary url).authority case result do
{:ok, {{_, status, _}, headers, body}} ->
content_type = parse_content_type headers['content-type']
[ if status == 301 or status == 302 or status == 303 or status == 307 do
status: status, new_url = to_binary headers['location']
domain: domain, # Fix relative (non-absolute) Location URIs by prefixing the current scheme and host
type: content_type, if not String.starts_with? String.downcase(new_url), ["http://", "https://"] do
size: content_length, IO.puts "Fixing " <> new_url <> " to..."
title: Nulform.Utilities.to_utf8(to_binary title) new_url = URI.parse(to_binary url).scheme <> "://" <> domain <> "/" <> String.lstrip new_url, "/"
] IO.inspect new_url
end
analyze_url binary_to_list(new_url), redirects + 1, mode
else
if mode != :get and (status != 200 or Enum.any? @parseable_types, fn(x) -> x == content_type end) do
analyze_url url, redirects + 1, :get
else
IO.inspect mode
IO.inspect status
IO.inspect content_type
if mode == :get and status == 200 and Enum.any? @parseable_types, fn(x) -> x == content_type end do
title = parse_title body
IO.inspect title
end
IO.inspect Nulform.Utilities.to_utf8(title)
content_length = to_binary headers['content-length']
if content_length != "" do
content_length = binary_to_integer content_length
end
{
status,
domain,
content_type,
content_length,
Nulform.Utilities.to_utf8 title
}
end
end
{:error, error} -> {:error, error, domain}
end end
end end
@ -128,9 +196,10 @@ defmodule Nulform.Plugins.URLAnalyzer do
title = Regex.scan regex, html title = Regex.scan regex, html
if not Enum.empty? title do if not Enum.empty? title do
Enum.at Enum.at(title, 0), 0 title = Enum.at Enum.at(title, 0), 0
Enum.join String.split(title), " "
else else
nil ""
end end
end end
@ -163,7 +232,7 @@ defmodule Nulform.Plugins.URLAnalyzer do
end end
def human_bytes(size, factor, decimals, [ext | rest]) do def human_bytes(size, factor, decimals, [ext | rest]) do
float_to_binary(size, [decimals: decimals]) <> " " <> ext float_to_binary(:erlang.float(size), [decimals: decimals]) <> " " <> ext
end end
defp parse_content_type(header) do defp parse_content_type(header) do