Add www. support, strip the URL fragment after #, collapse excess whitespace in titles

This commit is contained in:
Mikko Ahlroth 2013-08-04 00:49:30 +03:00
parent 8e9120f88a
commit f1d7e8a753

View file

@@ -68,8 +68,8 @@ defmodule Nulform.Plugins.URLAnalyzer do
(?::\d+)? # Optional port number
(?:
(?:/[^?\s]*)+ # URL path, anything non-?, non-ws separated by /
(?:\?(?:\S*))? # Optional query string, anything after ?
)? # Make the whole path & query part optional
(?:\?\S*)? # Optional query string, anything after ?, up to #
)? # Make the whole path & query part optional
)
"""
@@ -146,8 +146,15 @@ defmodule Nulform.Plugins.URLAnalyzer do
def analyze_url(url, redirects, mode) do
IO.puts "Analyzing " <> url <> " round " <> to_binary redirects
title = ""
domain = URI.parse(url).authority
if String.starts_with? url, "www." do
url = "http://" <> url
end
# Strip anchor
url = Enum.at String.split(url, "#"), 0
domain = URI.parse(url).authority
case match_blacklist URI.parse(url).host do
true -> {:error, :blacklisted, domain}
false ->
@@ -214,7 +221,7 @@ defmodule Nulform.Plugins.URLAnalyzer do
if not Enum.empty? title do
title = Enum.at Enum.at(title, 0), 0
Enum.join String.split(title), " "
Regex.replace %R/\s+/, title, " "
else
""
end