URLAnalyzer now works most of the time; still need to fix a few hanging bugs
This commit is contained in:
parent
455204cf7c
commit
f70404a3c4
1 changed file with 117 additions and 48 deletions
|
@ -19,20 +19,30 @@ defmodule Nulform.Plugins.URLAnalyzer do
|
|||
# We need a real user agent since some sites fail on nonstandard ones
|
||||
@headers [{'user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36'}]
|
||||
@max_redirects 10
|
||||
@parseable_types [
|
||||
"text/html", "text/xml", "application/xml", "application/xhtml+xml"
|
||||
]
|
||||
|
||||
def init() do
|
||||
:ok = :inets.start()
|
||||
{:ok}
|
||||
def init(nil) do
|
||||
# If these fail, let them fail, we'll crash later
|
||||
:inets.start()
|
||||
:ssl.start()
|
||||
{:ok, nil}
|
||||
end
|
||||
|
||||
def handle_cast(msg) do
|
||||
case String.split msg.raw_msg do
|
||||
[_, "PRIVMSG" | _] ->
|
||||
urls = parse_urls msg.raw_msg
|
||||
run_analyzer msg.buffer, nil, urls
|
||||
def handle_cast(msg, nil) do
|
||||
IO.inspect msg
|
||||
in_msg = Nulform.IRC.PRIVMSG.parse msg
|
||||
|
||||
if in_msg != nil do
|
||||
IO.puts "Message parsed!"
|
||||
urls = parse_urls in_msg.text
|
||||
IO.inspect urls
|
||||
id = if(Enum.count(urls) > 1, do: 0)
|
||||
run_analyzer in_msg, id, urls
|
||||
end
|
||||
|
||||
{:noreply}
|
||||
{:noreply, nil}
|
||||
end
|
||||
|
||||
def parse_urls(msg) do
|
||||
|
@ -59,67 +69,125 @@ defmodule Nulform.Plugins.URLAnalyzer do
|
|||
Regex.scan regex, msg
|
||||
end
|
||||
|
||||
def run_analyzer(parent, id, urls) when is_list urls do
|
||||
def run_analyzer(msg, id, urls) when is_list urls do
|
||||
if not Enum.empty? urls do
|
||||
[url | rest] = urls
|
||||
Kernel.spawn __MODULE__, :analyze_url, [parent, elem(url, 0)]
|
||||
run_analyzer parent, id, rest
|
||||
Kernel.spawn __MODULE__, :analyze_url, [msg, id, Enum.at(url, 0)]
|
||||
id = if(id != nil, do: id + 1)
|
||||
run_analyzer msg, id, rest
|
||||
end
|
||||
end
|
||||
|
||||
def analyze_url(parent, id, url) when is_binary url do
|
||||
def analyze_url(msg, id, url) when is_binary url do
|
||||
id_str = ""
|
||||
size_str = ""
|
||||
|
||||
{status, type, size, title} = binary_to_list url |> analyze_url
|
||||
if id != nil do
|
||||
id_str = "(" <> id <> ") "
|
||||
id_str = "(" <> to_binary(id) <> ") "
|
||||
end
|
||||
|
||||
if is_number size and size > 0 do
|
||||
size_str = "(" <> human_bytes(size) <> ")"
|
||||
analysis = binary_to_list(url) |> analyze_url
|
||||
|
||||
case analysis do
|
||||
{status, domain, type, size, title} ->
|
||||
result = id_str <> "[" <> domain <> "] "
|
||||
if status == 200 do
|
||||
if is_number(size) and size > 0 do
|
||||
size_str = "(" <> human_bytes(size) <> ")"
|
||||
end
|
||||
|
||||
result = result <> type <> " " <> size_str
|
||||
|
||||
IO.inspect result
|
||||
IO.inspect title
|
||||
if title != "" do
|
||||
result = result <> " | " <> title
|
||||
IO.inspect result
|
||||
end
|
||||
else
|
||||
result = result <> "HTTP " <> to_binary status
|
||||
end
|
||||
|
||||
{:error, error, domain} ->
|
||||
result = id_str <> "[" <> domain <> "] "
|
||||
case error do
|
||||
:timeout -> result = result <> "Timed out."
|
||||
:no_scheme -> result = result <> "No scheme."
|
||||
:max_redirects -> result = result <> "Too many redirects."
|
||||
{:failed_connect, _} -> result = result <> "Connection failed."
|
||||
end
|
||||
end
|
||||
|
||||
result = id_str
|
||||
result_msg = Nulform.IRC.PRIVMSG.reply msg, result
|
||||
:gen_server.cast msg.info.buffer, result_msg
|
||||
end
|
||||
|
||||
def analyze_url(url) when is_list url do
|
||||
def analyze_url(url) do
|
||||
analyze_url url, 0
|
||||
end
|
||||
|
||||
def analyze_url(url, redirects) when redirects > @max_redirects do
|
||||
nil
|
||||
def analyze_url(url, redirects) do
|
||||
analyze_url url, redirects, :head
|
||||
end
|
||||
|
||||
def analyze_url(url, redirects) when is_list url do
|
||||
title = nil
|
||||
{:ok, {{_, status, _}, headers, body}} = http_head url
|
||||
def analyze_url(url, redirects, mode) when redirects > @max_redirects do
|
||||
{:error, :max_redirects, URI.parse(to_binary url).authority}
|
||||
end
|
||||
|
||||
content_type = parse_content_type headers['content-type']
|
||||
def analyze_url(url, redirects, mode) do
|
||||
IO.puts "Analyzing " <> to_binary(url) <> " round " <> to_binary redirects
|
||||
title = ""
|
||||
domain = URI.parse(to_binary url).authority
|
||||
|
||||
if status == 301 or status == 302 or status == 303 or status == 307 do
|
||||
analyze_url(headers['location'], redirects + 1)
|
||||
else
|
||||
#if (status == 200 and content_type == "text/html") do
|
||||
{:ok, {{_, status, _}, headers, body}} = http_get url
|
||||
title = parse_title body
|
||||
content_type = parse_content_type headers['content-type']
|
||||
#end
|
||||
|
||||
content_length = to_binary headers['content-length']
|
||||
if content_length != "" do
|
||||
content_length = binary_to_integer content_length
|
||||
result =
|
||||
case mode do
|
||||
:head -> http_head url
|
||||
:get -> http_get url
|
||||
_ -> {:error, :unknown_method} # fail
|
||||
end
|
||||
|
||||
domain = URI.parse(to_binary url).authority
|
||||
case result do
|
||||
{:ok, {{_, status, _}, headers, body}} ->
|
||||
content_type = parse_content_type headers['content-type']
|
||||
|
||||
[
|
||||
status: status,
|
||||
domain: domain,
|
||||
type: content_type,
|
||||
size: content_length,
|
||||
title: Nulform.Utilities.to_utf8(to_binary title)
|
||||
]
|
||||
if status == 301 or status == 302 or status == 303 or status == 307 do
|
||||
new_url = to_binary headers['location']
|
||||
# Resolve non-absolute Location header values against the original scheme and host
|
||||
if not String.starts_with? String.downcase(new_url), ["http://", "https://"] do
|
||||
IO.puts "Fixing " <> new_url <> " to..."
|
||||
new_url = URI.parse(to_binary url).scheme <> "://" <> domain <> "/" <> String.lstrip new_url, "/"
|
||||
IO.inspect new_url
|
||||
end
|
||||
analyze_url binary_to_list(new_url), redirects + 1, mode
|
||||
else
|
||||
if mode != :get and (status != 200 or Enum.any? @parseable_types, fn(x) -> x == content_type end) do
|
||||
analyze_url url, redirects + 1, :get
|
||||
else
|
||||
IO.inspect mode
|
||||
IO.inspect status
|
||||
IO.inspect content_type
|
||||
if mode == :get and status == 200 and Enum.any? @parseable_types, fn(x) -> x == content_type end do
|
||||
title = parse_title body
|
||||
IO.inspect title
|
||||
end
|
||||
IO.inspect Nulform.Utilities.to_utf8(title)
|
||||
|
||||
content_length = to_binary headers['content-length']
|
||||
if content_length != "" do
|
||||
content_length = binary_to_integer content_length
|
||||
end
|
||||
|
||||
{
|
||||
status,
|
||||
domain,
|
||||
content_type,
|
||||
content_length,
|
||||
Nulform.Utilities.to_utf8 title
|
||||
}
|
||||
end
|
||||
end
|
||||
|
||||
{:error, error} -> {:error, error, domain}
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -128,9 +196,10 @@ defmodule Nulform.Plugins.URLAnalyzer do
|
|||
title = Regex.scan regex, html
|
||||
|
||||
if not Enum.empty? title do
|
||||
Enum.at Enum.at(title, 0), 0
|
||||
title = Enum.at Enum.at(title, 0), 0
|
||||
Enum.join String.split(title), " "
|
||||
else
|
||||
nil
|
||||
""
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -163,7 +232,7 @@ defmodule Nulform.Plugins.URLAnalyzer do
|
|||
end
|
||||
|
||||
def human_bytes(size, factor, decimals, [ext | rest]) do
|
||||
float_to_binary(size, [decimals: decimals]) <> " " <> ext
|
||||
float_to_binary(:erlang.float(size), [decimals: decimals]) <> " " <> ext
|
||||
end
|
||||
|
||||
defp parse_content_type(header) do
|
||||
|
|
Loading…
Reference in a new issue