Add www. support, strip the URL fragment (everything from #), clean titles with excess whitespace
This commit is contained in:
parent
8e9120f88a
commit
f1d7e8a753
1 changed file with 11 additions and 4 deletions
|
@ -68,8 +68,8 @@ defmodule Nulform.Plugins.URLAnalyzer do
|
||||||
(?::\d+)? # Optional port number
|
(?::\d+)? # Optional port number
|
||||||
(?:
|
(?:
|
||||||
(?:/[^?\s]*)+ # URL path, anything non-?, non-ws separated by /
|
(?:/[^?\s]*)+ # URL path, anything non-?, non-ws separated by /
|
||||||
(?:\?(?:\S*))? # Optional query string, anything after ?
|
(?:\?\S*)? # Optional query string, anything after ?, up to #
|
||||||
)? # Make the whole path & query part optional
|
)? # Make the whole path & query part optional
|
||||||
)
|
)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@ -146,8 +146,15 @@ defmodule Nulform.Plugins.URLAnalyzer do
|
||||||
def analyze_url(url, redirects, mode) do
|
def analyze_url(url, redirects, mode) do
|
||||||
IO.puts "Analyzing " <> url <> " round " <> to_binary redirects
|
IO.puts "Analyzing " <> url <> " round " <> to_binary redirects
|
||||||
title = ""
|
title = ""
|
||||||
domain = URI.parse(url).authority
|
|
||||||
|
|
||||||
|
if String.starts_with? url, "www." do
|
||||||
|
url = "http://" <> url
|
||||||
|
end
|
||||||
|
|
||||||
|
# Strip anchor
|
||||||
|
url = Enum.at String.split(url, "#"), 0
|
||||||
|
|
||||||
|
domain = URI.parse(url).authority
|
||||||
case match_blacklist URI.parse(url).host do
|
case match_blacklist URI.parse(url).host do
|
||||||
true -> {:error, :blacklisted, domain}
|
true -> {:error, :blacklisted, domain}
|
||||||
false ->
|
false ->
|
||||||
|
@ -214,7 +221,7 @@ defmodule Nulform.Plugins.URLAnalyzer do
|
||||||
|
|
||||||
if not Enum.empty? title do
|
if not Enum.empty? title do
|
||||||
title = Enum.at Enum.at(title, 0), 0
|
title = Enum.at Enum.at(title, 0), 0
|
||||||
Enum.join String.split(title), " "
|
Regex.replace %R/\s+/, title, " "
|
||||||
else
|
else
|
||||||
""
|
""
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in a new issue