This repository has been archived on 2024-06-16. You can view files and clone it, but cannot push or open issues or pull requests.
mebe/lib/mebe_engine/crawler.ex

84 lines
2.5 KiB
Elixir
Raw Normal View History

2015-05-14 20:58:25 +00:00
defmodule MebeEngine.Crawler do
  @moduledoc """
  The crawler goes through the specified directory, opening and parsing all the matching files
  inside concurrently.
  """

  require Logger

  alias MebeEngine.Parser
  alias MebeEngine.Models.Page
  alias MebeEngine.Models.Post
  alias MebeWeb.Utils

  # Starting accumulator for `construct_archives/1`.
  @empty_archives %{
    pages: %{},
    posts: [],
    years: %{},
    months: %{},
    tags: %{},
    authors: %{},
    author_names: %{}
  }

  @doc """
  Crawls `path` for Markdown files, parses each of them in its own task and
  folds the parsed results into the archive map built by `construct_archives/1`.
  """
  @spec crawl(String.t()) :: map()
  def crawl(path) do
    path
    |> get_files()
    |> Enum.map(fn file -> Task.async(__MODULE__, :parse, [file]) end)
    |> handle_responses()
    |> construct_archives()
  end

  @doc """
  Returns every file matching `path <> "/**/*.md"`.

  The search pattern, the current working directory and each found file are
  logged at info level.
  """
  @spec get_files(String.t()) :: [String.t()]
  def get_files(path) do
    path = path <> "/**/*.md"
    Logger.info("Searching files using '#{path}' with cwd '#{current_dir()}'")

    files = Path.wildcard(path)

    Logger.info("Found files:")
    Enum.each(files, fn file -> Logger.info(file) end)

    files
  end

  @doc """
  Reads `file` and hands its contents, together with the file's base name, to
  `MebeEngine.Parser.parse/2`.

  Raises if the file cannot be read (`File.read!/1`).
  """
  @spec parse(String.t()) :: term()
  def parse(file) do
    file
    |> File.read!()
    |> Parser.parse(Path.basename(file))
  end

  @doc """
  Awaits every task in `tasklist`, in order, and returns the list of results.

  Each task is awaited with `Task.await/1`'s default timeout.
  """
  @spec handle_responses([Task.t()]) :: [term()]
  def handle_responses(tasklist) do
    Enum.map(tasklist, &Task.await/1)
  end

  @doc """
  Builds the archive map out of a list of parsed pages and posts.

  The result has the keys `:pages` (slug => page), `:posts` (list of posts),
  `:years` (year => posts), `:months` ({year, month} => posts), `:tags`
  (tag => posts), and, when the `:multi_author_mode` setting is truthy,
  `:authors` (author slug => posts) and `:author_names` (author slug => name).

  All post lists are built by prepending, so they end up in the reverse of the
  order in which the posts appear in `datalist`.
  """
  @spec construct_archives([map()]) :: map()
  def construct_archives(datalist) do
    multi_author_mode = Utils.get_conf(:multi_author_mode)

    Enum.reduce(datalist, @empty_archives, fn pagedata, acc ->
      case pagedata.__struct__ do
        Page -> %{acc | pages: Map.put(acc.pages, pagedata.slug, pagedata)}
        Post -> add_post(acc, pagedata, multi_author_mode)
      end
    end)
  end

  # Adds a single post to every post-related archive in `acc`.
  defp add_post(acc, post, multi_author_mode) do
    {year, month, _day} = post.date

    tags =
      Enum.reduce(post.tags, acc.tags, fn tag, tagmap ->
        prepend_to(tagmap, tag, post)
      end)

    # The author maps must come back as a return value: variables rebound
    # inside an `if` block are not visible after the block ends.
    {authors, author_names} = add_author(acc, post, multi_author_mode)

    %{
      acc
      | posts: [post | acc.posts],
        years: prepend_to(acc.years, year, post),
        months: prepend_to(acc.months, {year, month}, post),
        tags: tags,
        authors: authors,
        author_names: author_names
    }
  end

  # Returns the `{authors, author_names}` maps updated with `post` when
  # multi author mode is on; otherwise returns them unchanged.
  defp add_author(acc, post, multi_author_mode) do
    if multi_author_mode do
      author_name = Utils.get_author(post)
      author_slug = Utils.slugify(author_name)

      # Authors end up with the name that was in the post with the first matching slug
      {
        prepend_to(acc.authors, author_slug, post),
        Map.put_new(acc.author_names, author_slug, author_name)
      }
    else
      {acc.authors, acc.author_names}
    end
  end

  # Prepends `item` to the list stored under `key`, starting a new list if needed.
  defp prepend_to(map, key, item) do
    Map.update(map, key, [item], fn items -> [item | items] end)
  end

  # Current working directory for the log message, or nil when it cannot be
  # determined, so that logging never raises.
  defp current_dir do
    case File.cwd() do
      {:ok, dir} -> dir
      {:error, _reason} -> nil
    end
  end
end