Add www. support, strip the URL fragment (everything from #), collapse excess whitespace in titles

This commit is contained in:
Mikko Ahlroth 2013-08-04 00:49:30 +03:00
parent 8e9120f88a
commit f1d7e8a753

View file

@ -68,8 +68,8 @@ defmodule Nulform.Plugins.URLAnalyzer do
(?::\d+)? # Optional port number (?::\d+)? # Optional port number
(?: (?:
(?:/[^?\s]*)+ # URL path, anything non-?, non-ws separated by / (?:/[^?\s]*)+ # URL path, anything non-?, non-ws separated by /
(?:\?(?:\S*))? # Optional query string, anything after ? (?:\?\S*)? # Optional query string, anything after ?, up to #
)? # Make the whole path & query part optional )? # Make the whole path & query part optional
) )
""" """
@ -146,8 +146,15 @@ defmodule Nulform.Plugins.URLAnalyzer do
def analyze_url(url, redirects, mode) do def analyze_url(url, redirects, mode) do
IO.puts "Analyzing " <> url <> " round " <> to_binary redirects IO.puts "Analyzing " <> url <> " round " <> to_binary redirects
title = "" title = ""
domain = URI.parse(url).authority
if String.starts_with? url, "www." do
url = "http://" <> url
end
# Strip anchor
url = Enum.at String.split(url, "#"), 0
domain = URI.parse(url).authority
case match_blacklist URI.parse(url).host do case match_blacklist URI.parse(url).host do
true -> {:error, :blacklisted, domain} true -> {:error, :blacklisted, domain}
false -> false ->
@ -214,7 +221,7 @@ defmodule Nulform.Plugins.URLAnalyzer do
if not Enum.empty? title do if not Enum.empty? title do
title = Enum.at Enum.at(title, 0), 0 title = Enum.at Enum.at(title, 0), 0
Enum.join String.split(title), " " Regex.replace %R/\s+/, title, " "
else else
"" ""
end end