From 18401f5ca2789bb2ffbb4dbb3ea9649fa9369f61 Mon Sep 17 00:00:00 2001 From: Zachary Vance Date: Mon, 29 Jul 2024 17:04:51 -0400 Subject: [PATCH] Problem survey --- blog | 8 +++++ html2markdown.js | 1 + markdown2html.py | 47 ++++++++++++++++++++++++-- pixel-compare.js | 48 +++++++++++++++++++++++--- problem_survey.txt | 34 +++++++++++++++++++ templates/post.mustache.html | 2 +- templates/postcombined.mustache.html | 10 ++++-- visualdiff-display.py | 49 +++++++++++++++++++++++++++ visualdiff.py | 50 ++++++++++++++++------------ wordpress2frontmatter.py | 9 +++-- 10 files changed, 224 insertions(+), 34 deletions(-) create mode 100644 problem_survey.txt create mode 100644 visualdiff-display.py diff --git a/blog b/blog index ad06892..54dd3dd 100755 --- a/blog +++ b/blog @@ -188,6 +188,7 @@ class Templatable(PseudoMap): html = mustache.render(template, context, warn=True) if replace_links: html = blog.replace_links(source, html) + html = blog.small_corrections(html) return html def content(self): @@ -235,6 +236,7 @@ class Post(Templatable): if k in {"tags", "author", "categories"}: k = "_" + k self[k] = v + self.post_title = self.title # Avoids a dumb thing with str.title in a template # TODO: Add an 'above the fold' part, maybe @@ -432,6 +434,12 @@ class Blog(PseudoMap): link_regex = '(? inside

is illegal + for figure in soup.find_all("figure"): + parent = figure.parent + while parent.name in ["a"]: + parent = parent.parent + if parent.name == "p" and len(list(parent.children)) == 1: + parent.unwrap() + + #for p in soup.select("blockquote > p"): + # if len(p.find_previous_siblings("p")) > 0: continue + # p['style'] = "color:#222222;" + #for p in soup.select("li > p"): + # p.unwrap() + return soup def markdown2html(html): html = markdown2.markdown(html, extras=['tables', 'header-ids', 'fenced-code-blocks']) - return html soup = BeautifulSoup(html, 'html.parser') soup = post_process(soup) - return soup.prettify() + return str(soup) #.prettify() diff --git a/pixel-compare.js b/pixel-compare.js index 31107a6..15f019a 100644 --- a/pixel-compare.js +++ b/pixel-compare.js @@ -1,25 +1,63 @@ const puppeteer = require('puppeteer') const path = require('path') const arguments = process.argv +const util = require('util'); +const exec = util.promisify(require('child_process').exec); -async function compare(url1, url2) { +function sleep(time) { + return new Promise((resolve) => setTimeout(resolve, time)) +} + +async function compare(url1, url2, save1, save2, diff) { const browser = await puppeteer.launch() const page = await browser.newPage() const options = { fullPage: true, + optimizeForSpeed: true, } await page.goto(`file://${path.join(__dirname, url1)}`) + options.path = save1 const render1 = await page.screenshot(options) await page.goto(`file://${path.join(__dirname, url2)}`) + options.path = save2 const render2 = await page.screenshot(options) - return !render1.compare(render2) + + const command = `compare -compose src ${save1} ${save2} ${diff}` + try { + await exec(command) + } catch(err) { + if (err.code !== 1) throw err + } + + const result = await exec(`magick ${diff} -alpha off -fill black +opaque "#cccccc" \\( +clone -evaluate set 0 \\) -metric AE -compare -format "%[distortion] %w %h\n" info:`) + + const [pixelsSame, width, height] = result.stdout.trim().split(" ") + + const identical = !render1.compare(render2) + const pixelsDifferent = height*width - pixelsSame + heightDifference = Math.abs(render1.length - render2.length) + var ret = { + identical, + heightDifference, + pixelsDifferent, + percentDifferent: pixelsDifferent / (pixelsDifferent + pixelsSame), + rowsDifferent: pixelsDifferent / width, + pixelsSame, + height, + width, + } + return ret } (async () => { -var same = await compare(process.argv[2], process.argv[3]); -if (same) process.exit(0) -else process.exit(1) +var ret = await compare(process.argv[2], process.argv[3], process.argv[4], process.argv[5], process.argv[6]); + +process.stdout.write(JSON.stringify(ret)) +process.stdout.write("\n") + +if (!ret.identical) process.exitCode = 1 +process.exit() })(); diff --git a/problem_survey.txt b/problem_survey.txt new file mode 100644 index 0000000..f387751 --- /dev/null +++ b/problem_survey.txt @@ -0,0 +1,34 @@ +Problem Survey + + +++++++++++++++ wp-code-block should be wp-block-code (installing-email-with-postfix-and-dovecot) +++++++ gallery layout information is being dropped -- rows (banh-chung) +image captions are being converted to

text. originals are: + +

(painting) + + div.wp-block-cover + +++++++ p.wp-caption-text (whiteboard-partition) + wrapped inside div.wp-caption, which adds whitespace +++ Original has no

after , just unwrapped text (diy-keyboards-and-how-keyboards-work) ++ image alignright dropped (whiteboard-partition) ++++ yt embed missing a little surrounding space. Original was wrapped in a figure.wp-block-embed.is-type-video.is-provider-youtube.wp-block-embed-youtube.wp-embed-aspect-16-9.wp-has-aspect-ratio (relay-music) ++++++ img.alignnone lost, which was providing a border (a-pixel-art): dontfix +++ img.aligncenter lost, which was providing centering (the-double-lives-of-books) and border (dontfix) ++
after img.alignnone displaying differently (steak-tartare-3) ++ figure class_ is being set, not class (steak-tartare-3) +++ image specified height and width (cron-email-and-sending-email-to-only-one-address) ++++

with an empty link. this is a problem in the markdown->HTML step for image links (moreorcs-com.html) + +++ or empty

only (printable-todo-list) +++ markdown bold breaking across lines. HTML->markdown broken (2020-books) +++++++ markdown version has additional whitespace are

 inside 
  • (mail-filtering-with-dovecot) ++
    converted to

    for table (understanding-gzip-2) +++ figure.wp-block-table should wrap a table (understanding-gzip-2) ++ several images in a single

    tag should be converted to a gallery row (default-twitter-icon) ++ markdown alt text is having markdown applied inside it. this is a bug in the markdown->HTML step (default-twitter-icon) ++ video got deleted (e-ink-laptop) ++

    being generated inside

  • , and adding too much space + ++
     got lost (archiving-twitter)
    ++ 
     messed up (xp-boot-usb-stick) -- problem in markdown to html conversion, i think
    ++ make font bigger (hack-a-day-hack-a-hang)
    ++ literal * messing with bolding (2022-books)
    ++ Link inside quote should be bolded (the-bible-translated-to-the-new-latin)
    diff --git a/templates/post.mustache.html b/templates/post.mustache.html
    index 2521633..f0a6f18 100644
    --- a/templates/post.mustache.html
    +++ b/templates/post.mustache.html
    @@ -35,7 +35,7 @@
     {{#main_display}}
     {{#comments}}
     
    -

    Responses to {{title}}

    +

    Responses to {{post_title}}

    {{& comments }}
    {{/comments}} diff --git a/templates/postcombined.mustache.html b/templates/postcombined.mustache.html index 85f2a3f..31d252e 100644 --- a/templates/postcombined.mustache.html +++ b/templates/postcombined.mustache.html @@ -62,8 +62,14 @@ iframe {
    diff --git a/visualdiff-display.py b/visualdiff-display.py new file mode 100644 index 0000000..4b32c6b --- /dev/null +++ b/visualdiff-display.py @@ -0,0 +1,49 @@ +""" +Display top problems +""" + +import csv +import json +import multiprocessing +import os +import pathlib +import subprocess +import sys +import time + +tqdm = lambda x, **kw: x +if sys.stdout.isatty(): + try: + from tqdm import tqdm + except ImportError: + pass + +def unsorted_parallel_map(f, lst, n=10): + with multiprocessing.Pool(n) as p: + yield from tqdm(p.imap_unordered(f, lst), total=len(lst)) + + +def sort_order(x): + return [ + x["pixel-perfect?"] == "False", + int(x["height-difference"]), + int(x["pixels-different"]), + x["post-id"], + ] + +def open_webpage(url): + subprocess.run(["chromium", url]) + +if __name__ == "__main__": + with open("visual-diff.csv", "r") as _csv: + csvfile = csv.DictReader(_csv, dialect="excel") + #csvfile.writerow(["post-id", "url", "local-url", "pixel-perfect?", "html-screenshot", "markdown-screenshot", "diff-screenshot", "height-difference", "pixels-different"]) + rows = list(csvfile) + + ordered = sorted(rows, key=sort_order, reverse=True) + ordered = [x for x in ordered if x["pixel-perfect?"] == "False"] + #for x in ordered[:10]: + # print(x["post-id"], x["height-difference"], x["pixels-different"], x["pixel-perfect?"]) + for x in ordered:#[:30]: + print(x["local-url"]) + open_webpage(x["local-url"]) diff --git a/visualdiff.py b/visualdiff.py index 7b78469..c4c7e34 100644 --- a/visualdiff.py +++ b/visualdiff.py @@ -4,12 +4,13 @@ Assumes they are already generated """ import csv -import pathlib -import subprocess -import time +import json import multiprocessing import os +import pathlib +import subprocess import sys +import time tqdm = lambda x, **kw: x if sys.stdout.isatty(): @@ -22,50 +23,57 @@ def unsorted_parallel_map(f, lst, n=10): with multiprocessing.Pool(n) as p: yield from tqdm(p.imap_unordered(f, lst), total=len(lst)) -def pixel_compare(path1, path2): - return subprocess.run(["node", "pixel-compare.js", path1, path2], stderr=subprocess.DEVNULL).returncode == 0 +def pixel_compare(*args): + result = subprocess.run(["node", "pixel-compare.js"] + list(args), capture_output=True) + identical = result.returncode == 0 + output = result.stdout.decode('utf8') + try: + result = json.loads(output) + except: + print(args, output, file=sys.stderr) + raise -def html_compare(path1, path2): - return False + return identical, result def blog_articles(): return sorted(x.stem for x in pathlib.Path("posts-html").iterdir()) -def compare(post_id): +def compare(post_id, save=False): url = "https://blog2.za3k.com/posts/{}.html".format(post_id) + local_url = "file:///home/zachary/blog/public/posts/{}.html".format(post_id) html_path = "public/posts/{}.orig.html".format(post_id) markdown_path = "public/posts/{}.md.html".format(post_id) - pixel_identical = pixel_compare(html_path, markdown_path) - html_identical = html_compare(html_path, markdown_path) - return [post_id, url, pixel_identical, html_identical] + html_screenshot_path = "screenshots/{}.html.png".format(post_id) + markdown_screenshot_path = "screenshots/{}.md.png".format(post_id) + visual_diff_path = "screenshots/{}.diff.png".format(post_id) + pixel_identical, ret = pixel_compare(html_path, markdown_path, html_screenshot_path, markdown_screenshot_path, visual_diff_path) + return [post_id, url, local_url, pixel_identical, html_screenshot_path, markdown_screenshot_path, visual_diff_path, ret["heightDifference"], ret["pixelsDifferent"]] if __name__ == "__main__": start_time = time.time() with open("visual-diff.csv", "w") as _csv: csvfile = csv.writer(_csv, dialect="excel") - csvfile.writerow(["post-id", "url", "pixel-perfect?", "html-identical?"]) + csvfile.writerow(["post-id", "url", "local-url", "pixel-perfect?", "html-screenshot", "markdown-screenshot", "diff-screenshot", "height-difference", "pixels-different"]) rows = sorted(unsorted_parallel_map(compare, blog_articles())) csvfile.writerows(rows) time_elapsed = time.time() - start_time - both_identical = len([x for x in rows if x[2] and x[3]]) - pixel_identical = len([x for x in rows if x[2]]) - both_identical - html_identical = len([x for x in rows if x[3]]) - both_identical total = len(rows) - neither_identical = total - pixel_identical - html_identical - both_identical + pixel_identical = len([x for x in rows if x[3]]) + not_identical = total - pixel_identical + example_failure = sorted([x[0] for x in rows if not x[3]])[0] print(" Progress Tracker\n") - print(" pixel:NO pixel: YES ") + print(" DIFFERENT SAME ") print(" |-----------|-----------|") - print(" html:NO | {: >2.0f}% | {: >2.0f}% | {: >2.0f}%".format(neither_identical/total*100, pixel_identical/total*100, (neither_identical + pixel_identical)/total*100)) + print(" | {: >3.0f}% | {: >3.0f}% | 100%".format(not_identical/total*100, pixel_identical/total*100)) print(" |-----------|-----------|") - print(" html:YES | {: >2.0f}% | {: >2.0f}% | {: >2.0f}%".format(html_identical/total*100, both_identical/total*100, (html_identical + both_identical)/total*100)) + print(" | {: >3d} | {: >3d} | {: >3d}".format(not_identical, pixel_identical, total)) print(" |-----------|-----------|") - print(" {: >2.0f}% {: >2.0f}%".format((neither_identical + html_identical)/total*100, (pixel_identical+both_identical)/total*100)) print() - print("Posts: {}".format(total)) print("Time: {:.0f}s".format(time_elapsed)) print("Time per file: {:.2f}s".format(time_elapsed/total)) + print("Next failure: {}".format(example_failure)) os.system("rm -r /tmp/puppeteer_dev_chrome_profile-X*") diff --git a/wordpress2frontmatter.py b/wordpress2frontmatter.py index 7b6acec..54bcd53 100644 --- a/wordpress2frontmatter.py +++ b/wordpress2frontmatter.py @@ -5,7 +5,7 @@ import yaml from pathlib import Path INPUT_DIR = Path("/home/zachary/blog.za3k.com") -OUTPUT_DIR = Path("/home/zachary/blog/posts") +OUTPUT_DIR = Path("/home/zachary/blog/posts-html") IMAGES = OUTPUT_DIR / 'images' BLACKLIST={"wp-json", "feed"} @@ -23,9 +23,14 @@ def parse_date(s): def scrape_post(post): html = bs4.BeautifulSoup(post, 'html.parser') article = html.find('article') - comments = html.find('ol', class_="commentlist") post = article.find('div', class_="entry-content") + for x in html.select('ol.commentlist > li.pingback'): + x.extract() + comments = html.find('ol', class_="commentlist") + if comments and len(comments.find_all('li')) == 0: + comments = None + result = {} result["html_content"] = str(post) result["html_comments"] = (str(comments) if comments else "") -- 2.47.3