"""Convert WordPress-generated post pages into front-matter HTML files.

Recovered from commit 8863fd94424695a7e974382077d7a810b2058ce6
("Initial commit", Zachary Vance, 2024-07-01), which creates blog2html.py.

For every ``<slug>/index.html`` found directly under ``INPUT_DIR`` the script
extracts the article, its comment list and some metadata, then writes
``OUTPUT_DIR/<slug>.html`` consisting of a YAML front-matter block followed by
the raw HTML.
"""
import sys  # not used at present; retained from the original file
from datetime import datetime
from pathlib import Path

# NOTE: bs4 and yaml are imported inside the functions that need them so the
# pure helpers (posts, parse_date) stay importable without third-party packages.

INPUT_DIR = Path("/home/zachary/blog.za3k.com")
OUTPUT_DIR = Path("/home/zachary/blog_converter/posts")
IMAGES = OUTPUT_DIR / 'images'

# Top-level directory names that are never treated as posts.
BLACKLIST = {"wp-json", "feed"}


def posts(input_dir=None):
    """Yield the ``index.html`` path of every candidate post directory.

    Args:
        input_dir: Directory to scan. Defaults to ``INPUT_DIR``.

    Yields:
        ``<input_dir>/<name>/index.html`` for each immediate sub-directory
        whose name is not in ``BLACKLIST`` and which contains that file.
        The order is whatever ``Path.iterdir`` produces.
    """
    root = INPUT_DIR if input_dir is None else Path(input_dir)
    for entry in root.iterdir():
        if not entry.is_dir() or entry.name in BLACKLIST:
            continue
        index = entry / "index.html"
        if index.is_file():
            yield index


def parse_date(s):
    """Parse a timestamp such as ``2023-07-17T13:58:49-07:00``.

    Returns:
        A timezone-aware ``datetime``.

    Raises:
        ValueError: If *s* does not match ``%Y-%m-%dT%H:%M:%S%z``.
    """
    return datetime.strptime(s, "%Y-%m-%dT%H:%M:%S%z")


def scrape_post(post):
    """Extract content and metadata from the HTML text of one post page.

    Args:
        post: The page's HTML source as a string.

    Returns:
        A dict with the keys ``html_content``, ``html_comments``, ``title``,
        ``date``, ``wordpress_id``, ``categories``, ``tags``, ``source``,
        ``author`` and ``markup``, plus ``updated`` when the page carries a
        ``<time class="updated">`` element.

    Raises:
        ValueError: If the page has no ``<article>`` element, or if the
            article's title differs from the document ``<title>``.
    """
    import bs4

    html = bs4.BeautifulSoup(post, 'html.parser')
    article = html.find('article')
    if article is None:
        raise ValueError("no <article> element found in post")
    comments = html.find('ol', id="commentlist")

    result = {}
    result["html_content"] = str(article)
    result["html_comments"] = str(comments) if comments else ""
    result["title"] = article.find('h1', class_="entry-title").get_text()

    # Sanity check on the scrape. Raised explicitly rather than asserted so
    # the check still runs under ``python -O``.
    page_title = html.find('title').get_text()
    if result["title"] != page_title:
        raise ValueError(
            f"article title {result['title']!r} does not match "
            f"page title {page_title!r}"
        )

    result["date"] = parse_date(html.find('time', class_="published")['datetime'])
    if updated := html.find('time', class_="updated"):
        result["updated"] = parse_date(updated['datetime'])
    # The article's id attribute has the form "post-<number>".
    result["wordpress_id"] = int(article['id'].removeprefix("post-"))
    result["categories"] = [
        link.get_text() for link in article.select(".bl_categ a[rel=tag]")
    ]
    result["tags"] = [
        link.get_text() for link in article.select(".bl_posted a[rel=tag]")
    ]
    result["source"] = "wordpress"
    result["author"] = article.find(rel="author").get_text()
    result["markup"] = "html"

    return result


def output_post(post):
    """Render a scraped post as YAML front matter followed by its HTML.

    Args:
        post: Dict as returned by ``scrape_post``; must contain
            ``html_content`` and ``html_comments``. It is not modified.

    Returns:
        ``"---\\n<yaml>---\\n"`` followed by a newline, the article HTML,
        three newlines and the comments HTML. Every key other than the two
        HTML ones goes into the YAML block.
    """
    import yaml

    # Work on a shallow copy so the caller's dict keeps its HTML entries.
    meta = dict(post)
    content = "\n" + meta.pop("html_content") + "\n\n\n" + meta.pop("html_comments")
    front_matter = yaml.dump(meta)
    return f"---\n{front_matter}---\n{content}"


def main(input_dir=None, output_dir=None):
    """Convert every post under *input_dir* into a file in *output_dir*.

    Args:
        input_dir: Directory to scan. Defaults to ``INPUT_DIR``.
        output_dir: Directory to write ``<slug>.html`` files into; created if
            missing. Defaults to ``OUTPUT_DIR``.
    """
    out_root = OUTPUT_DIR if output_dir is None else Path(output_dir)
    out_root.mkdir(parents=True, exist_ok=True)

    for post_path in posts(input_dir):
        # The post's directory name doubles as its slug.
        slug = post_path.parent.name
        output_path = out_root / (slug + ".html")
        # Explicit encoding so the result does not depend on the locale.
        # NOTE(review): assumes the pages are UTF-8 - confirm.
        parse = scrape_post(post_path.read_text(encoding="utf-8"))
        parse["wordpress_slug"] = slug
        output_path.write_text(output_post(parse), encoding="utf-8")


if __name__ == "__main__":
    main()