URL analyzer rewritten as GenServer, works pretty well. Added utilities
This commit is contained in:
parent
14830d9c99
commit
c5e925b214
2 changed files with 176 additions and 23 deletions
|
@ -1,4 +1,5 @@
|
|||
defmodule Nulform.Plugins.URLAnalyzer do
|
||||
use GenServer.Behaviour
|
||||
@moduledoc """
|
||||
This is an example plugin which analyzes URLs on IRC. It scans incoming
|
||||
IRC messages for URLs and analyzes them, returning data about them.
|
||||
|
@ -12,40 +13,160 @@ defmodule Nulform.Plugins.URLAnalyzer do
|
|||
* Try to find <title> contents from body and return it with other data
|
||||
if found.
|
||||
"""
|
||||
@timeout 10000
|
||||
@base_options [body_format: :binary, sync: true]
|
||||
@http_options [timeout: @timeout, autoredirect: false]
|
||||
# We need a real user agent since some sites fail on nonstandard ones
|
||||
@headers [{'user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36'}]
|
||||
@max_redirects 10
|
||||
|
||||
def run() do
|
||||
run nil
|
||||
def init() do
|
||||
:ok = :inets.start()
|
||||
{:ok}
|
||||
end
|
||||
|
||||
def run(parent) do
|
||||
receive do
|
||||
{:nulform, :set_parent, parent} ->
|
||||
:ok = :inets.start()
|
||||
|
||||
{:nulform, :urlanalyze, url, id} ->
|
||||
Kernel.spawn __MODULE__, :run_analyzer, [parent, Kernel.binary_to_list url]
|
||||
def handle_cast(msg) do
|
||||
case String.split msg.raw_msg do
|
||||
[_, "PRIVMSG" | _] ->
|
||||
urls = parse_urls msg.raw_msg
|
||||
run_analyzer msg.buffer, nil, urls
|
||||
end
|
||||
|
||||
run parent
|
||||
{:noreply}
|
||||
end
|
||||
|
||||
def run_analyzer(parent, url) do
|
||||
{:ok, {{_, status, _}, headers, body}} = :httpc.request :head, {url, []}, [], []
|
||||
[content_type | _] = String.split to_binary(headers['content-type']), ";"
|
||||
def parse_urls(msg) do
|
||||
regex = %R"""ix
|
||||
(
|
||||
(?:
|
||||
(?:https?://)|(?:www\.) # Catch http://, https:// and www.
|
||||
)
|
||||
(?:
|
||||
(?:\w+\-)*\w+\.
|
||||
)+ # Hostname parts, \w separated by - and . at the end
|
||||
(?:
|
||||
(?:[a-z]+(?:\w+\-)*\w+) # Top-level domain, starts with a-z
|
||||
\.? # Optional root domain dot
|
||||
(?::\d+)? # Optional port number
|
||||
)
|
||||
(?:
|
||||
(?:/[^?\s]*)+ # URL path, anything non-?, non-ws separated by /
|
||||
(?:\?(?:\S*))? # Optional query string, anything after ?
|
||||
)? # Make the whole path & query part optional
|
||||
)
|
||||
"""
|
||||
|
||||
if status != 200 do
|
||||
result = {status, content_type, headers['content-length'], body}
|
||||
Regex.scan regex, msg
|
||||
end
|
||||
|
||||
@doc """
Spawns one `analyze_url/3` process for every entry in `urls`.

`parent` and `id` are handed unchanged to each spawned call. The URL text is
taken from the first element of each entry (`elem(url, 0)`), so every entry
must be a tuple. Returns nil once the list is exhausted.
"""
def run_analyzer(_parent, _id, []) do
  nil
end

def run_analyzer(parent, id, [url | rest]) do
  # `id` has to be part of the argument list: with only two arguments the
  # spawn would dispatch to analyze_url/2 (url, redirects) and treat `parent`
  # as the URL instead of reaching analyze_url/3.
  Kernel.spawn __MODULE__, :analyze_url, [parent, id, elem(url, 0)]
  run_analyzer parent, id, rest
end
|
||||
|
||||
@doc """
Analyzes a single binary `url` and builds the textual parts of its report.

`id` is an optional binary tag; when it is not nil it is rendered as
`"(id) "`. Returns nil when `analyze_url/1` returns nil, otherwise the id
prefix string (which is `""` when `id` is nil).
"""
def analyze_url(_parent, id, url) when is_binary url do
  # Parentheses are required: `binary_to_list url |> analyze_url` parses as
  # binary_to_list(url |> analyze_url), which calls analyze_url/1 with a
  # binary and matches no clause.
  case analyze_url(binary_to_list(url)) do
    nil ->
      nil

    info ->
      # Bind the results of the `if` expressions instead of rebinding
      # variables inside their bodies.
      id_str = if id != nil, do: "(" <> id <> ") ", else: ""

      # NOTE(review): assumes analyze_url/1 returns the keyword list built by
      # analyze_url/2 (status/domain/type/size/title) rather than a 4-tuple;
      # confirm against that function.
      size = info[:size]

      # Explicit parentheses: `is_number size and size > 0` parses as
      # is_number(size and size > 0) and fails for any non-boolean size.
      _size_str =
        if is_number(size) and size > 0 do
          "(" <> human_bytes(size) <> ")"
        else
          ""
        end

      # NOTE(review): as in the original, only the id prefix is returned; the
      # size string is built but not yet part of any output and nothing is
      # sent to the parent process. Confirm the intended message format.
      id_str
  end
end
|
||||
|
||||
@doc """
Analyzes a char list `url`, starting the redirect counter at zero.

Delegates to `analyze_url/2` and returns whatever that call returns.
"""
def analyze_url(url) when is_list(url), do: analyze_url(url, 0)
|
||||
|
||||
def analyze_url(url, redirects) when redirects > @max_redirects do
|
||||
nil
|
||||
end
|
||||
|
||||
def analyze_url(url, redirects) when is_list url do
|
||||
title = nil
|
||||
{:ok, {{_, status, _}, headers, body}} = http_head url
|
||||
|
||||
content_type = parse_content_type headers['content-type']
|
||||
|
||||
if status == 301 or status == 302 or status == 303 or status == 307 do
|
||||
analyze_url(headers['location'], redirects + 1)
|
||||
else
|
||||
if content_type == "text/html" do
|
||||
{:ok, {{_, status, _}, headers, body}} = :httpc.request url
|
||||
end
|
||||
result = {status, content_type, headers['content_length'], body}
|
||||
end
|
||||
#if (status == 200 and content_type == "text/html") do
|
||||
{:ok, {{_, status, _}, headers, body}} = http_get url
|
||||
title = parse_title body
|
||||
content_type = parse_content_type headers['content-type']
|
||||
#end
|
||||
|
||||
parent <- result
|
||||
content_length = to_binary headers['content-length']
|
||||
if content_length != "" do
|
||||
content_length = binary_to_integer content_length
|
||||
end
|
||||
|
||||
domain = URI.parse(to_binary url).authority
|
||||
|
||||
[
|
||||
status: status,
|
||||
domain: domain,
|
||||
type: content_type,
|
||||
size: content_length,
|
||||
title: Nulform.Utilities.to_utf8(to_binary title)
|
||||
]
|
||||
end
|
||||
end
|
||||
|
||||
def find_urls(message) do
|
||||
[]
|
||||
@doc """
Scans `html` for a `<title>` element (case-insensitively).

Returns the first element of the first match reported by `Regex.scan/2`, or
nil when the document contains no match.
"""
def parse_title(html) do
  title_regex = %R@<title.*?>([^>]*?)<\s*?/\s*?title\s*?>@i

  case Regex.scan(title_regex, html) do
    [] -> nil
    [first_match | _] -> Enum.at(first_match, 0)
  end
end
|
||||
|
||||
@doc """
Issues an HTTP HEAD request for `url` through `http_req/2`.
"""
def http_head(url), do: http_req(:head, url)
|
||||
|
||||
@doc """
Issues an HTTP GET request for `url` through `http_req/2`.
"""
def http_get(url), do: http_req(:get, url)
|
||||
|
||||
@doc """
Performs an `:httpc` request for `url` using the HTTP method `mode`.

The request is sent with the module's `@headers`, `@http_options` and
`@base_options`, and the result of `:httpc.request/4` is returned as is.
"""
def http_req(mode, url) do
  request = {url, @headers}
  :httpc.request(mode, request, @http_options, @base_options)
end
|
||||
|
||||
@doc """
Formats a byte count as a human-readable string such as `"1.50 kB"`.

Uses a factor of 1000 between units and two decimals.
"""
def human_bytes(size) do
  human_bytes size, 1000
end

@doc """
Like `human_bytes/1`, with an explicit `factor` between successive units
(for example 1024).
"""
def human_bytes(size, factor) do
  human_bytes size, factor, 2
end

@doc """
Like `human_bytes/2`, with an explicit number of `decimals` in the output.
"""
def human_bytes(size, factor, decimals) do
  human_bytes size, factor, decimals, ["B", "kB", "MB", "GB", "TB", "PB"]
end

@doc """
Formats `size` using the given list of unit names, smallest unit first.

`size` is divided by `factor` once per unit for as long as it is at least
`factor` and a larger unit remains. When the units run out, the value is
reported in the largest unit instead of failing. The list of units must not
be empty.
"""
def human_bytes(size, factor, decimals, [_unit | [_ | _] = rest]) when size >= factor do
  human_bytes size / factor, factor, decimals, rest
end

def human_bytes(size, _factor, decimals, [unit | _rest]) do
  # `size / 1` turns integer sizes (anything below `factor` on the first
  # call) into floats, which is what float_to_binary requires.
  :erlang.float_to_binary(size / 1, [decimals: decimals]) <> " " <> unit
end
|
||||
|
||||
# Returns the part of a content-type header value that precedes the first
# ";". The header is converted with to_binary/1 before it is split.
defp parse_content_type(header) do
  [media_type | _params] = String.split(to_binary(header), ";")
  media_type
end
|
||||
end
|
||||
|
|
32
lib/nulform/utilities.ex
Normal file
32
lib/nulform/utilities.ex
Normal file
|
@ -0,0 +1,32 @@
|
|||
defmodule Nulform.Utilities do
  @moduledoc """
  Miscellaneous helpers for the bot that have no better home elsewhere.
  """

  @doc """
  Returns `string` as UTF-8, preserving the original content as far as
  possible.

  The encoding of the input is generally unknown. Input that is already valid
  UTF-8 is returned untouched; anything else is treated as latin-1 and
  converted. Telling latin-1 apart from the other 8-bit encodings would need
  unreliable heuristics, so latin-1 is the only fallback supported.
  """
  def to_utf8(string) do
    case String.valid?(string) do
      true -> string
      false -> latin1_to_utf8(string)
    end
  end

  @doc """
  Converts a latin-1 encoded string to UTF-8.

  The result is wrong if the input is not actually latin-1.
  """
  def latin1_to_utf8(string) do
    :unicode.characters_to_binary(string, :latin1, :utf8)
  end
end
|
Loading…
Reference in a new issue