diff --git a/lib/archive/schemas/page_view.ex b/lib/archive/schemas/page_view.ex
index 6a76e20..6fa35d5 100644
--- a/lib/archive/schemas/page_view.ex
+++ b/lib/archive/schemas/page_view.ex
@@ -16,7 +16,7 @@ defmodule Tilastokeskus.Archive.Schemas.PageView do
     # Request path without query string
     field(:path_noq, :string)
 
-    # Request host header
+    # Request URL host (authority)
     field(:host, :string)
 
     # Full HTTP referrer
diff --git a/lib/archive/scrubinator.ex b/lib/archive/scrubinator.ex
new file mode 100644
index 0000000..5d427db
--- /dev/null
+++ b/lib/archive/scrubinator.ex
@@ -0,0 +1,80 @@
+defmodule Tilastokeskus.Archive.Scrubinator do
+  @moduledoc """
+  Scrubinator is a timed assassin that periodically scrubs overly sensitive
+  information from stored page views.
+
+  It is started with a `%{days: days}` map. A scrub pass is triggered right
+  after startup, and each pass schedules the next one 24 hours later.
+  """
+
+  # Delay between scrub passes, in milliseconds (24 hours).
+  @how_often 24 * 60 * 60 * 1000
+
+  use GenServer
+  require Logger
+  alias Tilastokeskus.Archive.Schemas.PageView
+  alias Tilastokeskus.Archive.Repo
+  import Ecto.Query, only: [from: 2]
+
+  @doc """
+  Starts the scrubber process. `opts` must be a map with a `:days` key.
+  """
+  def start_link(opts) do
+    GenServer.start_link(__MODULE__, opts)
+  end
+
+  @impl true
+  def init(%{days: _days} = state) do
+    # Trigger the first pass through the mailbox instead of running it inline,
+    # so a slow UPDATE does not block the supervisor from starting the
+    # remaining children. handle_info/2 takes care of rescheduling.
+    send(self(), :scrub)
+    {:ok, state}
+  end
+
+  @impl true
+  def handle_info(:scrub, %{days: days} = state) do
+    :ok = scrub(days)
+
+    Process.send_after(self(), :scrub, @how_often)
+    {:noreply, state}
+  end
+
+  @doc """
+  Scrub hits older than `days` days of private data.
+
+  Clears `addr`, `ua` and `loc_city` and sets `scrubbed` on every page view
+  whose `at` is at or before the cutoff and that has not been scrubbed yet.
+  Always returns `:ok`; if the cutoff cannot be represented as a `DateTime`,
+  the pass is skipped and an error is logged.
+  """
+  @spec scrub(integer) :: :ok
+  def scrub(days) do
+    now = DateTime.utc_now() |> DateTime.to_unix()
+    cutoff_unix = now - days * 24 * 60 * 60
+
+    case DateTime.from_unix(cutoff_unix) do
+      {:ok, cutoff} ->
+        from(
+          p in PageView,
+          where: p.at <= ^cutoff and p.scrubbed == false,
+          update: [
+            set: [
+              scrubbed: true,
+              addr: nil,
+              ua: nil,
+              loc_city: nil
+            ]
+          ]
+        )
+        |> Repo.update_all([])
+
+        :ok
+
+      {:error, reason} ->
+        # Do not fail silently: an unusable cutoff means nothing gets scrubbed.
+        Logger.error("Scrubinator: bad cutoff (days=#{inspect(days)}): #{inspect(reason)}")
+        :ok
+    end
+  end
+end
diff --git a/lib/tilastokeskus/application.ex b/lib/tilastokeskus/application.ex
index da87bdf..4d77416 100644
--- a/lib/tilastokeskus/application.ex
+++ b/lib/tilastokeskus/application.ex
@@ -10,10 +10,12 @@ defmodule Tilastokeskus.Application do
     port = (System.get_env("PORT") || "1971") |> String.to_integer()
 
     hosts = get_hosts()
+    days = get_days()
 
     # List all child processes to be supervised
     children = [
       {Tilastokeskus.Archive.Repo, []},
+      {Tilastokeskus.Archive.Scrubinator, %{days: days}},
       {Tilastokeskus.Reception.Router, [[hosts: hosts], [port: port]]}
     ]
 
@@ -29,4 +31,11 @@ defmodule Tilastokeskus.Application do
       hosts -> String.split(hosts, ",")
     end
   end
+
+  defp get_days() do
+    case System.get_env("TILASTOKESKUS_SCRUB_DAYS") do
+      nil -> 90
+      days -> String.to_integer(days)
+    end
+  end
 end