From f1d7e8a753c8f5e106550929ded4cfda84444d1d Mon Sep 17 00:00:00 2001 From: Mikko Ahlroth Date: Sun, 4 Aug 2013 00:49:30 +0300 Subject: [PATCH] Add www. support, cut url from #, clean titles with excess whitespace --- lib/plugins/urlanalyzer.ex | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/lib/plugins/urlanalyzer.ex b/lib/plugins/urlanalyzer.ex index 1e19372..9d27c2e 100644 --- a/lib/plugins/urlanalyzer.ex +++ b/lib/plugins/urlanalyzer.ex @@ -68,8 +68,8 @@ defmodule Nulform.Plugins.URLAnalyzer do (?::\d+)? # Optional port number (?: (?:/[^?\s]*)+ # URL path, anything non-?, non-ws separated by / - (?:\?(?:\S*))? # Optional query string, anything after ? - )? # Make the whole path & query part optional + (?:\?\S*)? # Optional query string, anything after ?, up to # + )? # Make the whole path & query part optional ) """ @@ -146,8 +146,15 @@ defmodule Nulform.Plugins.URLAnalyzer do def analyze_url(url, redirects, mode) do IO.puts "Analyzing " <> url <> " round " <> to_binary redirects title = "" - domain = URI.parse(url).authority + if String.starts_with? url, "www." do + url = "http://" <> url + end + + # Strip anchor + url = Enum.at String.split(url, "#"), 0 + + domain = URI.parse(url).authority case match_blacklist URI.parse(url).host do true -> {:error, :blacklisted, domain} false -> @@ -214,7 +221,7 @@ defmodule Nulform.Plugins.URLAnalyzer do if not Enum.empty? title do title = Enum.at Enum.at(title, 0), 0 - Enum.join String.split(title), " " + Regex.replace %R/\s+/, title, " " else "" end