]> git.za3k.com Git - blog.git/commitdiff
Problem survey
author Zachary Vance <za3k@za3k.com>
Mon, 29 Jul 2024 21:04:51 +0000 (17:04 -0400)
committer Zachary Vance <za3k@za3k.com>
Mon, 29 Jul 2024 21:04:51 +0000 (17:04 -0400)
blog
html2markdown.js
markdown2html.py
pixel-compare.js
problem_survey.txt [new file with mode: 0644]
templates/post.mustache.html
templates/postcombined.mustache.html
visualdiff-display.py [new file with mode: 0644]
visualdiff.py
wordpress2frontmatter.py

diff --git a/blog b/blog
index ad06892d31d45dd173e99d85a022ac78afcdf4af..54dd3dd5e7987cd21208b3cc2907d4882d725281 100755 (executable)
--- a/blog
+++ b/blog
@@ -188,6 +188,7 @@ class Templatable(PseudoMap):
         html = mustache.render(template, context, warn=True)
         if replace_links:
             html = blog.replace_links(source, html)
+        html = blog.small_corrections(html)
         return html
 
     def content(self):
@@ -235,6 +236,7 @@ class Post(Templatable):
             if k in {"tags", "author", "categories"}:
                 k = "_" + k
             self[k] = v
+        self.post_title = self.title # Avoids a dumb thing with str.title in a template
 
     # TODO: Add an 'above the fold' part, maybe
 
@@ -432,6 +434,12 @@ class Blog(PseudoMap):
         link_regex = '(?<!srcset=")(?<=")https://blog.za3k.com/([^"]*)(?=")'
         return re.sub(link_regex, lambda m: self.rewrite_link(source, m), html)
 
+    def small_corrections(self, html):
+        # This helps the visual diff tool work for pages with many images
+        html = html.replace('decoding="async"', '') 
+        html = html.replace('loading="lazy"', '')
+        return html
+
     def rewrite_link(self, source, match):
         link = Link(match.group(0), self, source.output_path)
         self.links.add(link)
index 2e3903028fe61085a6480b92351c1f1199f272f2..1584a410791f16c868e39d7a8fd86bcf776835b4 100644 (file)
@@ -46,6 +46,7 @@ var turndownService = new TurndownService({
 turndownService.use(turndownPluginGfm.tables)
 turndownService.use(myPlugin.pre)
 turndownService.keep('iframe')
+turndownService.keep('video')
 //turndownService.use(myPlugin.preserveHTMLMatching)
 
 const html = fs.readFileSync(0, "utf-8");
index bf7718dd944484629366fc0ab849872997722547..69b9fa8b048851e228df6f0f3b1ad53a6f4f3c2f 100644 (file)
@@ -1,12 +1,53 @@
 import markdown2
 from bs4 import BeautifulSoup
 
+
 def post_process(soup):
-    pass
+    for img in soup.find_all("img"):
+        figure = soup.new_tag("figure")
+        figure['class'] = "wp-block-image"
+        img.wrap(figure)
+
+    for pre in soup.find_all("pre"):
+        pre['class'] = "wp-block-code"
+
+    for hr in soup.find_all("hr"):
+        hr['class'] = "wp-block-separator"
+
+    for table in soup.find_all("table"):
+        figure = soup.new_tag("figure")
+        figure['class'] = "wp-block-table"
+        table.wrap(figure)
+
+    for video in soup.find_all("video"):
+        figure = soup.new_tag("figure")
+        figure['class'] = "wp-block-video"
+        video.wrap(figure)
+
+    # YT embeds
+    for iframe in soup.find_all("iframe"):
+        if 'youtube.com' in iframe['src']:
+            figure = soup.new_tag("figure")
+            figure['class'] = "wp-block-embed"
+            iframe.wrap(figure)
+            
+    # <figure> inside <p> is illegal
+    for figure in soup.find_all("figure"):
+        parent = figure.parent
+        while parent.name in ["a"]:
+            parent = parent.parent
+        if parent.name == "p" and len(list(parent.children)) == 1:
+            parent.unwrap()
+
+    #for p in soup.select("blockquote > p"):
+    #    if len(p.find_previous_siblings("p")) > 0: continue
+    #    p['style'] = "color:#222222;"
+    #for p in soup.select("li > p"):
+    #    p.unwrap()
+    return soup
 
 def markdown2html(html):
     html = markdown2.markdown(html, extras=['tables', 'header-ids', 'fenced-code-blocks'])
-    return html
     soup = BeautifulSoup(html, 'html.parser')
     soup = post_process(soup)
-    return soup.prettify()
+    return str(soup) #.prettify()
index 31107a6237f086e2e4a24ef1f15b553ea09868c4..15f019a91c49e37491dbf6667e40373842d8af74 100644 (file)
@@ -1,25 +1,63 @@
 const puppeteer = require('puppeteer')
 const path = require('path')
 const arguments = process.argv
+const util = require('util');
+const exec = util.promisify(require('child_process').exec);
 
-async function compare(url1, url2) {
+function sleep(time) {
+    return new Promise((resolve) => setTimeout(resolve, time))
+}
+
+async function compare(url1, url2, save1, save2, diff) {
     const browser = await puppeteer.launch()
     const page = await browser.newPage()
     const options = {
         fullPage: true,
+        optimizeForSpeed: true,
     }
 
     await page.goto(`file://${path.join(__dirname, url1)}`)
+    options.path = save1
     const render1 = await page.screenshot(options)
 
     await page.goto(`file://${path.join(__dirname, url2)}`)
+    options.path = save2
     const render2 = await page.screenshot(options)
 
-    return !render1.compare(render2)
+
+    const command = `compare -compose src ${save1} ${save2} ${diff}`
+    try {
+        await exec(command)
+    } catch(err) {
+        if (err.code !== 1) throw err
+    }
+
+    const result = await exec(`magick ${diff} -alpha off -fill black +opaque "#cccccc" \\( +clone -evaluate set 0 \\) -metric AE -compare -format "%[distortion] %w %h\n" info:`)
+
+    const [pixelsSame, width, height] = result.stdout.trim().split(" ")
+
+    const identical = !render1.compare(render2)
+    const pixelsDifferent = height*width - pixelsSame
+    heightDifference = Math.abs(render1.length - render2.length)
+    var ret = {
+      identical,
+      heightDifference,
+      pixelsDifferent,
+      percentDifferent: pixelsDifferent / (pixelsDifferent + pixelsSame),
+      rowsDifferent: pixelsDifferent / width,
+      pixelsSame,
+      height,
+      width,
+    }
+    return ret
 }
 
 (async () => {
-var same = await compare(process.argv[2], process.argv[3]);
-if (same) process.exit(0)
-else process.exit(1)
+var ret = await compare(process.argv[2], process.argv[3], process.argv[4], process.argv[5], process.argv[6]);
+
+process.stdout.write(JSON.stringify(ret))
+process.stdout.write("\n")
+
+if (!ret.identical) process.exitCode = 1
+process.exit()
 })();
diff --git a/problem_survey.txt b/problem_survey.txt
new file mode 100644 (file)
index 0000000..f387751
--- /dev/null
@@ -0,0 +1,34 @@
+Problem Survey
+
+
+++++++++++++++ wp-code-block should be wp-block-code (installing-email-with-postfix-and-dovecot)
+++++++ gallery layout information is being dropped -- rows (banh-chung)
+image captions are being converted to <p> text. originals are:
+    + <figcaption> (painting)
+    + div.wp-block-cover
+    +++++++ p.wp-caption-text (whiteboard-partition)
+        wrapped inside div.wp-caption, which adds whitespace
+++ Original has no <p> after <img>, just unwrapped text (diy-keyboards-and-how-keyboards-work)
++ image alignright dropped (whiteboard-partition)
++++ yt embed missing a little surrounding space. Original was wrapped in a figure.wp-block-embed.is-type-video.is-provider-youtube.wp-block-embed-youtube.wp-embed-aspect-16-9.wp-has-aspect-ratio (relay-music)
++++++ img.alignnone lost, which was providing a border (a-pixel-art): dontfix
+++ img.aligncenter lost, which was providing centering (the-double-lives-of-books) and border (dontfix)
++ <br> after img.alignnone displaying differently (steak-tartare-3)
++ figure class_ is being set, not class (steak-tartare-3)
+++ image specified height and width (cron-email-and-sending-email-to-only-one-address)
++++ <p> with an empty link. this is a problem in the markdown->HTML step for image links (moreorcs-com.html)
+    +++ or empty <p> only (printable-todo-list)
+++ markdown bold breaking across lines. HTML->markdown broken (2020-books)
+++++++ markdown version has additional whitespace around <pre> inside <li> (mail-filtering-with-dovecot)
++ <figcaption> converted to <p> for table (understanding-gzip-2)
+++ figure.wp-block-table should wrap a table (understanding-gzip-2)
++ several images in a single <p> tag should be converted to a gallery row (default-twitter-icon)
++ markdown alt text is having markdown applied inside it. this is a bug in the markdown->HTML step (default-twitter-icon)
++ video got deleted (e-ink-laptop)
++ <p> being generated inside <li>, and adding too much space
+
++ <pre> got lost (archiving-twitter)
++ <pre> messed up (xp-boot-usb-stick) -- problem in markdown to html conversion, i think
++ make font bigger (hack-a-day-hack-a-hang)
++ literal * messing with bolding (2022-books)
++ Link inside quote should be bolded (the-bible-translated-to-the-new-latin)
index 25216338efd3b197fb80b8f20abc9f5a962bdf30..f0a6f183db6145368693a846f874784e37e697dd 100644 (file)
@@ -35,7 +35,7 @@
 {{#main_display}}
 {{#comments}}
 <div id="comments">
-    <h3 id="comments-title">Responses to <em>{{title}}</em></h3>
+    <h3 id="comments-title">Responses to <em>{{post_title}}</em></h3>
     {{& comments }}
 </div>
 {{/comments}}
index 85f2a3f80b8bb8bb6a0fa407b47a306a880a105a..31d252e77297d65ec1696423ff50ccc4816a7018 100644 (file)
@@ -62,8 +62,14 @@ iframe {
 <body>
 <ul id="links">
     <a href="https://blog.za3k.com/{{id}}/">original blog</a>
-    <a href="https://git.za3k.com/?p=blog.git;a=blob_plain;f=posts-html/{{id}}.html;hb=HEAD">html source</a>
-    <a href="https://git.za3k.com/?p=blog.git;a=blob_plain;f=posts-md/{{id}}.md;hb=HEAD">markdown source</a>
+    <span><a href="https://git.za3k.com/?p=blog.git;a=blob_plain;f=posts-html/{{id}}.html;hb=HEAD">html source</a> <a href="file:///home/zachary/blog/posts-html/{{id}}.html">(local)</a></span>
+    <span><a href="https://git.za3k.com/?p=blog.git;a=blob_plain;f=posts-md/{{id}}.md;hb=HEAD">markdown source</a> <a href="file:///home/zachary/blog/posts-md/{{id}}.md">(local)</a></span>
+    <span>
+        Images:
+        <a href="file:///home/zachary/blog/screenshots/{{id}}.diff.png">diff</a> / 
+        <a href="file:///home/zachary/blog/screenshots/{{id}}.html.png">html</a> / 
+        <a href="file:///home/zachary/blog/screenshots/{{id}}.md.png">md</a>
+    </span>
 </ul>
 <div id="iframes">
     <div id="html">
diff --git a/visualdiff-display.py b/visualdiff-display.py
new file mode 100644 (file)
index 0000000..4b32c6b
--- /dev/null
@@ -0,0 +1,49 @@
+"""
+Display top problems
+"""
+
+import csv
+import json
+import multiprocessing
+import os
+import pathlib
+import subprocess
+import sys
+import time
+
+tqdm = lambda x, **kw: x
+if sys.stdout.isatty():
+    try:
+        from tqdm import tqdm
+    except ImportError:
+        pass
+
+def unsorted_parallel_map(f, lst, n=10):
+    with multiprocessing.Pool(n) as p:
+        yield from tqdm(p.imap_unordered(f, lst), total=len(lst))
+
+
+def sort_order(x):
+    return [
+        x["pixel-perfect?"] == "False",
+        int(x["height-difference"]),
+        int(x["pixels-different"]),
+        x["post-id"],
+    ]
+
+def open_webpage(url):
+    subprocess.run(["chromium", url])
+
+if __name__ == "__main__":
+    with open("visual-diff.csv", "r") as _csv:
+        csvfile = csv.DictReader(_csv, dialect="excel")
+        #csvfile.writerow(["post-id", "url", "local-url", "pixel-perfect?", "html-screenshot", "markdown-screenshot", "diff-screenshot", "height-difference", "pixels-different"])
+        rows = list(csvfile)
+
+    ordered = sorted(rows, key=sort_order, reverse=True)
+    ordered = [x for x in ordered if x["pixel-perfect?"] == "False"]
+    #for x in ordered[:10]:
+    #    print(x["post-id"], x["height-difference"], x["pixels-different"], x["pixel-perfect?"])
+    for x in ordered:#[:30]:
+        print(x["local-url"])
+        open_webpage(x["local-url"])
index 7b7846998cad7337074d15d9700cd54882409cee..c4c7e34a00e79621e210014f58b7cc500a643b86 100644 (file)
@@ -4,12 +4,13 @@ Assumes they are already generated
 """
 
 import csv
-import pathlib
-import subprocess
-import time
+import json
 import multiprocessing
 import os
+import pathlib
+import subprocess
 import sys
+import time
 
 tqdm = lambda x, **kw: x
 if sys.stdout.isatty():
@@ -22,50 +23,57 @@ def unsorted_parallel_map(f, lst, n=10):
     with multiprocessing.Pool(n) as p:
         yield from tqdm(p.imap_unordered(f, lst), total=len(lst))
 
-def pixel_compare(path1, path2):
-    return subprocess.run(["node", "pixel-compare.js", path1, path2], stderr=subprocess.DEVNULL).returncode == 0
+def pixel_compare(*args):
+    result = subprocess.run(["node", "pixel-compare.js"] + list(args), capture_output=True)
+    identical = result.returncode == 0
+    output = result.stdout.decode('utf8')
+    try:
+        result = json.loads(output)
+    except:
+        print(args, output, file=sys.stderr)
+        raise
 
-def html_compare(path1, path2):
-    return False
+    return identical, result
 
 def blog_articles():
     return sorted(x.stem for x in pathlib.Path("posts-html").iterdir())
 
-def compare(post_id):
+def compare(post_id, save=False):
     url = "https://blog2.za3k.com/posts/{}.html".format(post_id)
+    local_url = "file:///home/zachary/blog/public/posts/{}.html".format(post_id)
     html_path = "public/posts/{}.orig.html".format(post_id)
     markdown_path = "public/posts/{}.md.html".format(post_id)
-    pixel_identical = pixel_compare(html_path, markdown_path)
-    html_identical = html_compare(html_path, markdown_path)
-    return [post_id, url, pixel_identical, html_identical]
+    html_screenshot_path = "screenshots/{}.html.png".format(post_id)
+    markdown_screenshot_path = "screenshots/{}.md.png".format(post_id)
+    visual_diff_path = "screenshots/{}.diff.png".format(post_id)
+    pixel_identical, ret = pixel_compare(html_path, markdown_path, html_screenshot_path, markdown_screenshot_path, visual_diff_path)
+    return [post_id, url, local_url, pixel_identical, html_screenshot_path, markdown_screenshot_path, visual_diff_path, ret["heightDifference"], ret["pixelsDifferent"]]
 
 if __name__ == "__main__":
 
     start_time = time.time()
     with open("visual-diff.csv", "w") as _csv:
         csvfile = csv.writer(_csv, dialect="excel")
-        csvfile.writerow(["post-id", "url", "pixel-perfect?", "html-identical?"])
+        csvfile.writerow(["post-id", "url", "local-url", "pixel-perfect?", "html-screenshot", "markdown-screenshot", "diff-screenshot", "height-difference", "pixels-different"])
         rows = sorted(unsorted_parallel_map(compare, blog_articles()))
         csvfile.writerows(rows)
     time_elapsed = time.time() - start_time
 
-    both_identical = len([x for x in rows if x[2] and x[3]])
-    pixel_identical = len([x for x in rows if x[2]]) - both_identical
-    html_identical = len([x for x in rows if x[3]]) - both_identical
     total = len(rows)
-    neither_identical = total - pixel_identical - html_identical - both_identical
+    pixel_identical = len([x for x in rows if x[3]])
+    not_identical = total - pixel_identical
+    example_failure = sorted([x[0] for x in rows if not x[3]])[0]
 
     print("        Progress Tracker\n")
-    print("           pixel:NO   pixel: YES ")
+    print("            DIFFERENT      SAME    ")
     print("          |-----------|-----------|")
-    print(" html:NO  |    {: >2.0f}%    |    {: >2.0f}%    |    {: >2.0f}%".format(neither_identical/total*100, pixel_identical/total*100, (neither_identical + pixel_identical)/total*100))
+    print("          |    {: >3.0f}%   |    {: >3.0f}%   | 100%".format(not_identical/total*100, pixel_identical/total*100))
     print("          |-----------|-----------|")
-    print(" html:YES |    {: >2.0f}%    |    {: >2.0f}%    |    {: >2.0f}%".format(html_identical/total*100, both_identical/total*100, (html_identical + both_identical)/total*100))
+    print("          |    {: >3d}    |    {: >3d}    | {: >3d}".format(not_identical, pixel_identical, total))
     print("          |-----------|-----------|")
-    print("               {: >2.0f}%    {: >2.0f}%".format((neither_identical + html_identical)/total*100, (pixel_identical+both_identical)/total*100))
     print()
-    print("Posts: {}".format(total))
     print("Time: {:.0f}s".format(time_elapsed))
     print("Time per file: {:.2f}s".format(time_elapsed/total))
+    print("Next failure: {}".format(example_failure))
 
     os.system("rm -r /tmp/puppeteer_dev_chrome_profile-X*")
index 7b6acec2a9f9075544b198b9b103e5ca920cd28e..54bcd5341d8edfa72f9b43963d150a9d3a9b3cd8 100644 (file)
@@ -5,7 +5,7 @@ import yaml
 from pathlib import Path
 
 INPUT_DIR = Path("/home/zachary/blog.za3k.com")
-OUTPUT_DIR = Path("/home/zachary/blog/posts")
+OUTPUT_DIR = Path("/home/zachary/blog/posts-html")
 IMAGES = OUTPUT_DIR / 'images'
 
 BLACKLIST={"wp-json", "feed"}
@@ -23,9 +23,14 @@ def parse_date(s):
 def scrape_post(post):
     html = bs4.BeautifulSoup(post, 'html.parser')
     article = html.find('article')
-    comments = html.find('ol', class_="commentlist")
     post = article.find('div', class_="entry-content")
 
+    for x in html.select('ol.commentlist > li.pingback'):
+        x.extract()
+    comments = html.find('ol', class_="commentlist")
+    if comments and len(comments.find_all('li')) == 0:
+        comments = None
+
     result = {}
     result["html_content"] = str(post)
     result["html_comments"] = (str(comments) if comments else "")