html = mustache.render(template, context, warn=True)
if replace_links:
html = blog.replace_links(source, html)
+ html = blog.small_corrections(html)
return html
def content(self):
if k in {"tags", "author", "categories"}:
k = "_" + k
self[k] = v
+ self.post_title = self.title # Avoids a dumb thing with str.title in a template
# TODO: Add an 'above the fold' part, maybe
link_regex = '(?<!srcset=")(?<=")https://blog.za3k.com/([^"]*)(?=")'
return re.sub(link_regex, lambda m: self.rewrite_link(source, m), html)
def small_corrections(self, html):
    """Strip image-loading hint attributes from rendered HTML.

    Every occurrence of ``decoding="async"`` and ``loading="lazy"`` is
    removed; this helps the visual diff tool work for pages with many images.
    Returns the modified HTML string.
    """
    for attribute in ('decoding="async"', 'loading="lazy"'):
        html = html.replace(attribute, '')
    return html
+
def rewrite_link(self, source, match):
link = Link(match.group(0), self, source.output_path)
self.links.add(link)
turndownService.use(turndownPluginGfm.tables)
turndownService.use(myPlugin.pre)
turndownService.keep('iframe')
+turndownService.keep('video')
//turndownService.use(myPlugin.preserveHTMLMatching)
const html = fs.readFileSync(0, "utf-8");
import markdown2
from bs4 import BeautifulSoup
+
def _wrap_in_figure(soup, tag, css_class):
    """Wrap ``tag`` in a newly created ``<figure>`` whose class is ``css_class``."""
    figure = soup.new_tag("figure")
    figure['class'] = css_class
    tag.wrap(figure)


def post_process(soup):
    """Add WordPress-style block markup to a parsed HTML tree.

    The tree is modified in place and also returned:

    * ``<img>``, ``<table>`` and ``<video>`` are wrapped in a ``<figure>``
      with class ``wp-block-image`` / ``wp-block-table`` / ``wp-block-video``.
    * ``<iframe>`` elements whose ``src`` contains ``youtube.com`` are wrapped
      in ``<figure class="wp-block-embed">``.
    * ``<pre>`` gets class ``wp-block-code``; ``<hr>`` gets
      ``wp-block-separator``.
    * A ``<p>`` whose only child is a figure (optionally inside ``<a>``) is
      unwrapped.

    :param soup: the BeautifulSoup document to rewrite.
    :returns: the same ``soup`` object.
    """
    for img in soup.find_all("img"):
        _wrap_in_figure(soup, img, "wp-block-image")

    for pre in soup.find_all("pre"):
        pre['class'] = "wp-block-code"

    for hr in soup.find_all("hr"):
        hr['class'] = "wp-block-separator"

    for table in soup.find_all("table"):
        _wrap_in_figure(soup, table, "wp-block-table")

    for video in soup.find_all("video"):
        _wrap_in_figure(soup, video, "wp-block-video")

    # YT embeds
    for iframe in soup.find_all("iframe"):
        # .get() so an <iframe> without a src attribute is skipped rather
        # than raising KeyError.
        if 'youtube.com' in iframe.get('src', ''):
            _wrap_in_figure(soup, iframe, "wp-block-embed")

    # <figure> inside <p> is illegal
    for figure in soup.find_all("figure"):
        parent = figure.parent
        # Look through a wrapping link to find the element that holds it.
        while parent is not None and parent.name in ["a"]:
            parent = parent.parent
        if parent is not None and parent.name == "p" and len(list(parent.children)) == 1:
            parent.unwrap()

    #for p in soup.select("blockquote > p"):
    #    if len(p.find_previous_siblings("p")) > 0: continue
    #    p['style'] = "color:#222222;"
    #for p in soup.select("li > p"):
    #    p.unwrap()
    return soup
def markdown2html(html):
    """Convert text with markdown2 and return the post-processed HTML string.

    ``html`` is the text handed to ``markdown2.markdown``; the rendered
    result is parsed with BeautifulSoup, run through ``post_process`` and
    serialised with ``str()`` (not prettified).
    """
    rendered = markdown2.markdown(
        html,
        extras=['tables', 'header-ids', 'fenced-code-blocks'],
    )
    tree = post_process(BeautifulSoup(rendered, 'html.parser'))
    return str(tree) #.prettify()
const puppeteer = require('puppeteer')
const path = require('path')
const arguments = process.argv
+const util = require('util');
+const exec = util.promisify(require('child_process').exec);
-async function compare(url1, url2) {
/**
 * Return a promise that resolves once a `setTimeout` of `time` milliseconds fires.
 *
 * @param {number} time - Delay passed to `setTimeout`.
 * @returns {Promise<void>}
 */
function sleep(time) {
  return new Promise((resolve) => {
    setTimeout(resolve, time);
  });
}
+
/**
 * Screenshot two local HTML files and measure how much the renders differ.
 *
 * Both pages are loaded from `file://` URLs relative to this script's
 * directory, captured full-page to `save1` / `save2`, and then diffed with
 * the ImageMagick `compare` and `magick` command-line tools.
 *
 * @param {string} url1 - First page, relative to `__dirname`.
 * @param {string} url2 - Second page, relative to `__dirname`.
 * @param {string} save1 - Where the first screenshot is written.
 * @param {string} save2 - Where the second screenshot is written.
 * @param {string} diff - Where the visual diff image is written.
 * @returns {Promise<object>} `identical`, `heightDifference`,
 *   `pixelsDifferent`, `percentDifferent`, `rowsDifferent`, `pixelsSame`,
 *   `height` and `width`; all measurements are numbers.
 * @throws {Error} If an external command fails or prints unparsable output.
 */
async function compare(url1, url2, save1, save2, diff) {
  const browser = await puppeteer.launch();
  let render1;
  let render2;
  try {
    const page = await browser.newPage();
    const options = {
      fullPage: true,
      optimizeForSpeed: true,
    };
    await page.goto(`file://${path.join(__dirname, url1)}`);
    render1 = await page.screenshot({ ...options, path: save1 });
    await page.goto(`file://${path.join(__dirname, url2)}`);
    render2 = await page.screenshot({ ...options, path: save2 });
  } finally {
    // Always release the browser, even when navigation or capture fails.
    await browser.close();
  }

  // NOTE(review): the paths are interpolated into shell commands unquoted,
  // so they must not contain spaces or shell metacharacters.
  const command = `compare -compose src ${save1} ${save2} ${diff}`;
  try {
    await exec(command);
  } catch (err) {
    // Exit status 1 is tolerated (ImageMagick uses it to report that the
    // images differ); anything else is a real failure.
    if (err.code !== 1) throw err;
  }

  const result = await exec(`magick ${diff} -alpha off -fill black +opaque "#cccccc" \\( +clone -evaluate set 0 \\) -metric AE -compare -format "%[distortion] %w %h\n" info:`);

  // Convert to numbers up front: with strings, `a + b` below would
  // concatenate instead of adding.
  const [pixelsSame, width, height] = result.stdout.trim().split(" ").map(Number);
  if (![pixelsSame, width, height].every(Number.isFinite)) {
    throw new Error(`Unexpected output from magick: ${JSON.stringify(result.stdout)}`);
  }

  // Buffer.compare accepts both Buffer and Uint8Array screenshots.
  const identical = Buffer.compare(render1, render2) === 0;
  const totalPixels = height * width;
  const pixelsDifferent = totalPixels - pixelsSame;
  // NOTE(review): despite its name this is the difference between the two
  // encoded screenshots' byte lengths, not between image heights.
  const heightDifference = Math.abs(render1.length - render2.length);
  return {
    identical,
    heightDifference,
    pixelsDifferent,
    percentDifferent: totalPixels === 0 ? 0 : pixelsDifferent / totalPixels,
    rowsDifferent: width === 0 ? 0 : pixelsDifferent / width,
    pixelsSame,
    height,
    width,
  };
}
(async () => {
  // argv[2..6]: the two pages to compare, the two screenshot paths, and the
  // diff image path, in the order compare() expects them.
  const [, , url1, url2, save1, save2, diff] = process.argv;
  const ret = await compare(url1, url2, save1, save2, diff);

  // One JSON document per run, newline-terminated, on stdout.
  process.stdout.write(`${JSON.stringify(ret)}\n`);

  // Exit status 1 signals "renders are not identical" to the caller.
  if (!ret.identical) {
    process.exitCode = 1;
  }
  process.exit();
})();
--- /dev/null
+Problem Survey
+
+
+++++++++++++++ wp-code-block should be wp-block-code (installing-email-with-postfix-and-dovecot)
+++++++ gallery layout information is being dropped -- rows (banh-chung)
+image captions are being converted to <p> text. originals are:
+ + <figcaption> (painting)
+ + div.wp-block-cover
+ +++++++ p.wp-caption-text (whiteboard-partition)
+ wrapped inside div.wp-caption, which adds whitespace
+++ Original has no <p> after <img>, just unwrapped text (diy-keyboards-and-how-keyboards-work)
++ image alignright dropped (whiteboard-partition)
++++ yt embed missing a little surrounding space. Original was wrapped in a figure.wp-block-embed.is-type-video.is-provider-youtube.wp-block-embed-youtube.wp-embed-aspect-16-9.wp-has-aspect-ratio (relay-music)
++++++ img.alignnone lost, which was providing a border (a-pixel-art): dontfix
+++ img.aligncenter lost, which was providing centering (the-double-lives-of-books) and border (dontfix)
++ <br> after img.alignnone displaying differently (steak-tartare-3)
++ figure class_ is being set, not class (steak-tartare-3)
+++ image specified height and width (cron-email-and-sending-email-to-only-one-address)
++++ <p> with an empty link. this is a problem in the markdown->HTML step for image links (moreorcs-com.html)
+ +++ or empty <p> only (printable-todo-list)
+++ markdown bold breaking across lines. HTML->markdown broken (2020-books)
+++++++ markdown version has additional whitespace around <pre> inside <li> (mail-filtering-with-dovecot)
++ <figcaption> converted to <p> for table (understanding-gzip-2)
+++ figure.wp-block-table should wrap a table (understanding-gzip-2)
++ several images in a single <p> tag should be converted to a gallery row (default-twitter-icon)
++ markdown alt text is having markdown applied inside it. this is a bug in the markdown->HTML step (default-twitter-icon)
++ video got deleted (e-ink-laptop)
++ <p> being generated inside <li>, and adding too much space
+
++ <pre> got lost (archiving-twitter)
++ <pre> messed up (xp-boot-usb-stick) -- problem in markdown to html conversion, i think
++ make font bigger (hack-a-day-hack-a-hang)
++ literal * messing with bolding (2022-books)
++ Link inside quote should be bolded (the-bible-translated-to-the-new-latin)
{{#main_display}}
{{#comments}}
<div id="comments">
- <h3 id="comments-title">Responses to <em>{{title}}</em></h3>
+ <h3 id="comments-title">Responses to <em>{{post_title}}</em></h3>
{{& comments }}
</div>
{{/comments}}
<body>
<ul id="links">
<a href="https://blog.za3k.com/{{id}}/">original blog</a>
- <a href="https://git.za3k.com/?p=blog.git;a=blob_plain;f=posts-html/{{id}}.html;hb=HEAD">html source</a>
- <a href="https://git.za3k.com/?p=blog.git;a=blob_plain;f=posts-md/{{id}}.md;hb=HEAD">markdown source</a>
+ <span><a href="https://git.za3k.com/?p=blog.git;a=blob_plain;f=posts-html/{{id}}.html;hb=HEAD">html source</a> <a href="file:///home/zachary/blog/posts-html/{{id}}.html">(local)</a></span>
+ <span><a href="https://git.za3k.com/?p=blog.git;a=blob_plain;f=posts-md/{{id}}.md;hb=HEAD">markdown source</a> <a href="file:///home/zachary/blog/posts-md/{{id}}.md">(local)</a></span>
+ <span>
+ Images:
+ <a href="file:///home/zachary/blog/screenshots/{{id}}.diff.png">diff</a> /
+ <a href="file:///home/zachary/blog/screenshots/{{id}}.html.png">html</a> /
+ <a href="file:///home/zachary/blog/screenshots/{{id}}.md.png">md</a>
+ </span>
</ul>
<div id="iframes">
<div id="html">
--- /dev/null
+"""
+Display top problems
+"""
+
+import csv
+import json
+import multiprocessing
+import os
+import pathlib
+import subprocess
+import sys
+import time
+
+tqdm = lambda x, **kw: x
+if sys.stdout.isatty():
+ try:
+ from tqdm import tqdm
+ except ImportError:
+ pass
+
def unsorted_parallel_map(f, lst, n=10):
    """Yield ``f(item)`` for every item of ``lst`` using ``n`` worker processes.

    Results come from ``Pool.imap_unordered``, so they are yielded in
    whatever order they are produced rather than in input order. The
    iteration is passed through ``tqdm`` with ``total=len(lst)``.
    """
    with multiprocessing.Pool(n) as pool:
        outcomes = pool.imap_unordered(f, lst)
        for outcome in tqdm(outcomes, total=len(lst)):
            yield outcome
+
+
def sort_order(x):
    """Build the sort key for one row of ``visual-diff.csv``.

    The key is ``[is_different, height_difference, pixels_different,
    post_id]``: whether the "pixel-perfect?" column is the string "False",
    the two numeric columns converted to ``int``, then the post id.
    """
    is_different = x["pixel-perfect?"] == "False"
    height_delta = int(x["height-difference"])
    pixel_delta = int(x["pixels-different"])
    return [is_different, height_delta, pixel_delta, x["post-id"]]
+
def open_webpage(url):
    """Run the ``chromium`` command with ``url`` as its only argument."""
    command = ["chromium", url]
    subprocess.run(command)
+
if __name__ == "__main__":
    # Columns written by the comparison script:
    # ["post-id", "url", "local-url", "pixel-perfect?", "html-screenshot", "markdown-screenshot", "diff-screenshot", "height-difference", "pixels-different"]
    with open("visual-diff.csv", "r") as _csv:
        rows = list(csv.DictReader(_csv, dialect="excel"))

    # Largest sort key first, keeping only posts that are not pixel-perfect.
    ordered = [
        row
        for row in sorted(rows, key=sort_order, reverse=True)
        if row["pixel-perfect?"] == "False"
    ]
    #for x in ordered[:10]:
    #    print(x["post-id"], x["height-difference"], x["pixels-different"], x["pixel-perfect?"])
    for row in ordered:  # [:30] to limit how many pages are opened
        local_url = row["local-url"]
        print(local_url)
        open_webpage(local_url)
"""
import csv
-import pathlib
-import subprocess
-import time
+import json
import multiprocessing
import os
+import pathlib
+import subprocess
import sys
+import time
tqdm = lambda x, **kw: x
if sys.stdout.isatty():
with multiprocessing.Pool(n) as p:
yield from tqdm(p.imap_unordered(f, lst), total=len(lst))
def pixel_compare(*args):
    """Run ``pixel-compare.js`` under node and return ``(identical, report)``.

    :param args: command-line arguments forwarded to the script unchanged.
    :returns: a tuple of ``identical`` (True when the process exited with
        status 0) and ``report`` (the JSON document decoded from stdout).
    :raises json.JSONDecodeError: if stdout is not valid JSON; the arguments,
        stdout and stderr are printed to ``sys.stderr`` first.
    """
    completed = subprocess.run(["node", "pixel-compare.js"] + list(args), capture_output=True)
    identical = completed.returncode == 0
    output = completed.stdout.decode('utf8')
    try:
        report = json.loads(output)
    except json.JSONDecodeError:
        # capture_output also swallows stderr, which is usually where the
        # reason for the failure is, so print it with the diagnostics.
        errors = completed.stderr.decode('utf8', errors='replace')
        print(args, output, errors, file=sys.stderr)
        raise

    return identical, report
def blog_articles():
    """Return the sorted stems (names without the final suffix) of the entries in ``posts-html``."""
    stems = [entry.stem for entry in pathlib.Path("posts-html").iterdir()]
    stems.sort()
    return stems
def compare(post_id, save=False):
    """Compare the two rendered versions of one post and return its CSV row.

    The row is ``[post_id, url, local_url, pixel_identical, html_screenshot,
    markdown_screenshot, diff_screenshot, heightDifference, pixelsDifferent]``.
    ``save`` is accepted but not used.
    """
    url = "https://blog2.za3k.com/posts/{}.html".format(post_id)
    local_url = "file:///home/zachary/blog/public/posts/{}.html".format(post_id)
    inputs = [
        "public/posts/{}.orig.html".format(post_id),
        "public/posts/{}.md.html".format(post_id),
    ]
    # html screenshot, markdown screenshot, visual diff -- in that order.
    outputs = [
        "screenshots/{}.html.png".format(post_id),
        "screenshots/{}.md.png".format(post_id),
        "screenshots/{}.diff.png".format(post_id),
    ]
    pixel_identical, ret = pixel_compare(*inputs, *outputs)
    return [post_id, url, local_url, pixel_identical, *outputs, ret["heightDifference"], ret["pixelsDifferent"]]
if __name__ == "__main__":
    # Compare every post, write one CSV row per post, then print a summary.
    start_time = time.time()
    # newline="" is what the csv module asks for on files it writes to.
    with open("visual-diff.csv", "w", newline="") as _csv:
        csvfile = csv.writer(_csv, dialect="excel")
        csvfile.writerow(["post-id", "url", "local-url", "pixel-perfect?", "html-screenshot", "markdown-screenshot", "diff-screenshot", "height-difference", "pixels-different"])
        rows = sorted(unsorted_parallel_map(compare, blog_articles()))
        csvfile.writerows(rows)
    time_elapsed = time.time() - start_time

    total = len(rows)
    # Row layout comes from compare(): x[0] is the post id, x[3] is the
    # pixel-identical flag.
    pixel_identical = len([x for x in rows if x[3]])
    not_identical = total - pixel_identical
    failures = sorted([x[0] for x in rows if not x[3]])

    # Avoid ZeroDivisionError when no posts were found.
    not_identical_percent = not_identical/total*100 if total else 0.0
    pixel_identical_percent = pixel_identical/total*100 if total else 0.0

    print(" Progress Tracker\n")
    print(" DIFFERENT SAME ")
    print(" |-----------|-----------|")
    print(" | {: >3.0f}% | {: >3.0f}% | 100%".format(not_identical_percent, pixel_identical_percent))
    print(" |-----------|-----------|")
    print(" | {: >3d} | {: >3d} | {: >3d}".format(not_identical, pixel_identical, total))
    print(" |-----------|-----------|")
    print()
    print("Time: {:.0f}s".format(time_elapsed))
    if total:
        print("Time per file: {:.2f}s".format(time_elapsed/total))
    # Once every post is identical there is no next failure to report;
    # indexing [0] unconditionally would raise IndexError in that case.
    if failures:
        print("Next failure: {}".format(failures[0]))

    # Remove the temporary Chrome profiles left behind in /tmp.
    os.system("rm -r /tmp/puppeteer_dev_chrome_profile-X*")
from pathlib import Path
INPUT_DIR = Path("/home/zachary/blog.za3k.com")
-OUTPUT_DIR = Path("/home/zachary/blog/posts")
+OUTPUT_DIR = Path("/home/zachary/blog/posts-html")
IMAGES = OUTPUT_DIR / 'images'
BLACKLIST={"wp-json", "feed"}
def scrape_post(post):
html = bs4.BeautifulSoup(post, 'html.parser')
article = html.find('article')
- comments = html.find('ol', class_="commentlist")
post = article.find('div', class_="entry-content")
+ for x in html.select('ol.commentlist > li.pingback'):
+ x.extract()
+ comments = html.find('ol', class_="commentlist")
+ if comments and len(comments.find_all('li')) == 0:
+ comments = None
+
result = {}
result["html_content"] = str(post)
result["html_comments"] = (str(comments) if comments else "")