#!/bin/bash

# sync-gwern.net.sh: shell script which automates a full build and sync of Gwern.net. A simple build
# can be done using 'runghc hakyll.hs build', but that is slow, semi-error-prone (did you
# remember to delete all intermediates?), and does no sanity checks or optimizations like compiling
# the MathJax to static CSS/fonts (avoiding multi-second JS delays).
#
# This script automates all of that: it cleans up, compiles a hakyll binary for faster compilation,
# generates a sitemap XML file, optimizes the MathJax use, checks for many kinds of errors, uploads,
# and cleans up.
#
# Author: Gwern Branwen
# Date: 2016-10-01
# When: Time-stamp: "2019-09-15 14:38:14 gwern"
# License: CC-0

bold () { echo -e "\033[1m$@\033[0m"; }
red  () { echo -e "\e[41m$@\e[0m"; }
## function to wrap checks and print red-highlighted warning if non-zero output (self-documenting):
wrap () { OUTPUT=$($1 2>&1)
          WARN="$2"
          if [ -n "$OUTPUT" ]; then
              red "$WARN"
              echo -e "$OUTPUT"
          fi; }
eg () { egrep --color=always "$@"; }
gf () { fgrep --color=always "$@"; }

# key dependencies: GHC, Hakyll, s3cmd, emacs, curl, tidy (HTML5 version), urlencode
# ('gridsite-clients' package), linkchecker, fdupes, ImageMagick, exiftool, mathjax-node-page (eg.
# `npm i -g mathjax-node-page`), parallel, xargs, php7…

if ! [[ -n $(command -v ghc) && -n $(command -v git) && -n $(command -v rsync) && -n $(command -v curl) && -n $(command -v ping) && \
        -n $(command -v tidy) && -n $(command -v linkchecker) && -n $(command -v du) && -n $(command -v rm) && -n $(command -v find) && \
        -n $(command -v fdupes) && -n $(command -v urlencode) && -n $(command -v sed) && -n $(command -v parallel) && -n $(command -v xargs) && \
        -n $(command -v file) && -n $(command -v exiftool) && -n $(command -v identify) && -n $(command -v pdftotext) && \
        -n $(command -v ~/src/node_modules/mathjax-node-page/bin/mjpage) && -n $(command -v static/build/link-extractor.hs) && \
        -n $(command -v static/build/anchor-checker.php) && -n $(command -v php) && -n $(command -v static/build/generateDirectory.hs) && \
        -n $(command -v static/build/generateLinkBibliography.hs) && \
        -n $(command -v static/build/generateBacklinks.hs) && \
        -n $(command -v static/build/generateSimilarLinks.hs) && \
        -z "$(pgrep hakyll)" ]];
then red "Dependencies missing or Hakyll already running?"
else
    set -e

    # lower priority of everything we run (some of it is expensive):
    renice -n 15 "$$" &>/dev/null

    ## Parallelization: WARNING: post-2022-03 Hakyll uses parallelism which catastrophically slows down at >= # of physical cores; see
    N="$(if [ ${#} == 0 ]; then echo 31; else echo "$1"; fi)"

    (cd ~/wiki/ && git status || true) &
    bold "Pulling infrastructure updates…"
    (cd ./static/ && git status && timeout 10m git pull --verbose 'https://gwern.obormot.net/static/.git/' master || true)

    bold "Executing string rewrite cleanups…" # automatically clean up some Gwern.net bad URL patterns, typos, inconsistencies, house-styles:
    (gwsed 'https://mobile.twitter.com' 'https://twitter.com' & gwsed 'https://twitter.com/' 'https://nitter.hu/' &
     gwsed 'https://mobile.twitter.com/' 'https://nitter.hu/' & gwsed 'https://www.twitter.com/' 'https://nitter.hu/' &
     gwsed 'https://www.reddit.com/r/' 'https://old.reddit.com/r/' & gwsed 'https://en.m.wikipedia.org/' 'https://en.wikipedia.org/' &
     gwsed 'https://www.greaterwrong.com/posts/' 'https://www.lesswrong.com/posts' &
     gwsed '&hl=en' '' & gwsed '?hl=en&' '?' & gwsed '?hl=en' '' & gwsed '?usp=sharing' '' &
     gwsed '

' '

' & gwsed 'EMBASE' 'Embase' & gwsed 'Medline' 'MEDLINE' & gwsed 'PsychINFO' 'PsycINFO' &
     gwsed 'http://web.archive.org/web/' 'https://web.archive.org/web/' &
     gwsed 'https://youtu.be/' 'https://www.youtube.com/watch?v=' & gwsed '.html?pagewanted=all' '.html' &
     gwsed '(ie,' '(ie.' & gwsed '(ie ' '(ie. ' & gwsed '(i.e.,' '(ie.' & gwsed 'ie., ' 'ie. ' & gwsed '(i.e.' '(ie.' &
     gwsed '(eg, ' '(eg. ' & gwsed ' eg ' ' eg. ' & gwsed '(eg ' '(eg. ' & gwsed '[eg ' '[eg. ' &
     gwsed 'e.g. ' 'eg. ' & gwsed ' e.g. ' ' eg. ' & gwsed 'e.g.,' 'eg.' & gwsed 'eg.,' 'eg.' &
     gwsed ']^[' '] ^[' &
     gwsed ' et al., ' ' et al ' & gwsed 'et al., ' 'et al ' &
     gwsed '(cf ' '(cf. ' & gwsed ' cf ' ' cf. ' &
     gwsed ' _n_s' ' ns' & gwsed ' (n = ' ' (n = ' & gwsed ' (N = ' ' (n = ' &
     gwsed 'St' 'st' & gwsed 'Th' 'th' & gwsed 'Rd' 'rd' &
     gwsed ' de novo ' ' de novo ' & gwsed ' De Novo ' ' De Novo ' &
     gwsed ', Jr.' ' Junior' & gwsed ' Jr.' ' Junior' & gwsed ', Junior' ' Junior' &
     gwsed '.full-text' '.full' & gwsed '.full.full' '.full' & gwsed '.full-text' '.full' & gwsed '.full-text.full' '.full' &
     gwsed '.full.full.full' '.full' & gwsed '.full.full' '.full' &
     gwsed '#allen#allen' '#allen' & gwsed '#deepmind#deepmind' '#deepmind' & gwsed '&org=deepmind&org=deepmind' '&org=deepmind' &
     gwsed '#nvidia#nvidia' '#nvidia' & gwsed '#openai#openai' '#openai' & gwsed '#google#google' '#google' & gwsed '#uber#uber' '#uber' &
     gwsed 'MSCOCO' 'MS COCO' & gwsed '&feature=youtu.be' '' &
     gwsed 'Rene Girard' 'René Girard' & gwsed 'facebookok' 'facebook' &
     gwsed ':443/' '/' & gwsed 'border colly' 'border collie' & gwsed ':80/' '/' &
     gwsed '.gov/labs/pmc/articles/P' '.gov/pmc/articles/P' &
     gwsed 'rjlipton.wpcomstaging.com' 'rjlipton.wordpress.com' & gwsed '?s=r' '' &
     gwsed 'backlinks-not' 'backlink-not') &> /dev/null
    wait

    bold "Compiling…"
    cd ./static/build
    compile () { ghc -O2 -Wall -rtsopts -threaded --make "$@"; }
    compile hakyll.hs
    compile generateLinkBibliography.hs
    compile generateDirectory.hs
    compile preprocess-markdown.hs &
    ## NOTE: generateSimilarLinks.hs & link-suggester.hs are done at midnight by a cron job because
    ## they are too slow to run during a regular site build & don't need to be super-up-to-date
    ## anyway
    cd ../../

    cp ./metadata/auto.yaml "/tmp/auto-$(date +%s).yaml.bak" || true # backup in case of corruption
    cp ./metadata/archive.hs "/tmp/archive-$(date +%s).hs.bak"
    bold "Checking embeddings database…"
    ghci -i/home/gwern/wiki/static/build/ static/build/GenerateSimilar.hs -e 'e <- readEmbeddings' &>/dev/null && cp ./metadata/embeddings.bin "/tmp/embeddings-$(date +%s).bin.bak"

    # duplicates a later check, but if we have a fatal link error, we'd rather find out now than 30 minutes later while generating annotations:
    λ(){ fgrep -e 'href=""' -- ./metadata/*.yaml || true; }
    wrap λ "Malformed empty link in annotations?"

    # We update the linkSuggestions.el in a cron job because it is too expensive and vastly slows down the build.
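    ## Note on `gwsed`: it is an external helper, not defined in this script. A minimal sketch of the assumed
    ## behavior (a literal, global search-and-replace over the wiki's Markdown sources) would be something like:
    ##
    ##     gwsed () { FROM="$(printf '%s' "$1" | sed -e 's/[][\/.*&]/\\&/g')"
    ##                TO="$(printf '%s' "$2" | sed -e 's/[\/&]/\\&/g')"
    ##                find . -type f -name "*.page" -print0 | xargs -0 --no-run-if-empty sed -i "s/$FROM/$TO/g"; }
    ##
    ## This reimplementation is hypothetical (the real gwsed may also rewrite the YAML metadata or handle
    ## multi-line patterns differently); it only documents what the cleanup pass above is assumed to do.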
    # Update the directory listing index pages: there are a number of directories we want to avoid,
    # like the various mirrors or JS projects, or directories just of data like CSVs, or dumps of
    # docs, so we'll blacklist those:
    bold "Building directory indexes…"
    ./static/build/generateDirectory +RTS -N"$N" -RTS \
        $(find docs/ fiction/ haskell/ newsletter/ nootropics/ notes/ reviews/ zeo/ -type d \
           | sort | fgrep -v -e 'docs/www' -e 'docs/rotten.com' -e 'docs/genetics/selection/www.mountimprobable.com' \
           -e 'docs/biology/2000-iapac-norvir' -e 'docs/gwern.net-gitstats' -e 'docs/rl/armstrong-controlproblem' \
           -e 'docs/statistics/order/beanmachine-multistage' \
           -e 'docs/link-bibliography' | shuf) # we want to generate all directories first before running Hakyll in case a new tag was created

    bold "Updating link bibliographies…"
    ./static/build/generateLinkBibliography +RTS -N"$N" -RTS $(find . -type f -name "*.page" | sort | fgrep -v -e 'index.page' -e '404.page' -e 'docs/link-bibliography/' | sed -e 's/\.\///' | shuf) &

    bold "Check/update VCS…"
    cd ./static/ && (git status; git pull; git push --verbose &)
    cd ./build/
    # Cleanup pre:
    rm --recursive --force -- ~/wiki/_cache/ ~/wiki/_site/ ./static/build/hakyll ./static/build/*.o ./static/build/*.hi ./static/build/generateDirectory ./static/build/generateLinkBibliography ./static/build/generateBacklinks ./static/build/link-extractor ./static/build/link-suggester || true

    cd ../../ # go to site root
    bold "Building site…"
    time ./static/build/hakyll build +RTS -N"$N" -RTS || (red "Hakyll errored out!"; exit 1)
    bold "Results size…"
    du -chs ./_cache/ ./_site/; find ./_site/ -type f | wc --lines

    # cleanup post:
    rm -- ./static/build/hakyll ./static/build/*.o ./static/build/*.hi ./static/build/generateDirectory ./static/build/generateLinkBibliography ./static/build/generateBacklinks ./static/build/link-extractor &>/dev/null || true

    ## WARNING: this is a crazy hack to insert a horizontal rule 'in between' the first 3 sections
    ## on /index (Newest/Popular/Notable), and the rest (starting with Statistics); the CSS for
    ## making the rule a block dividing the two halves just doesn't work in any other way, but
    ## Pandoc Markdown doesn't let you write stuff 'in between' sections, either. So… a hack.
    sed -i -e 's/section id=\"statistics\"/hr class="horizontalRule-nth-1" \/> <section id="statistics"/' ./_site/index

    ## generate a sitemap file for search engines:
    (echo "<?xml version=\"1.0\" encoding=\"UTF-8\"?> <urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">"
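     ## (everything echoed inside this subshell, the XML prolog above and the per-file <url> entries below,
     ## is appended to ./_site/sitemap.xml by the redirection at the closing parenthesis)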

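     ## Each line emitted by the two find|sed pipelines below becomes one sitemap entry; an illustrative
     ## (hypothetical, not taken from the real site) example of the output format:
     ##   <url><loc>https://www.gwern.net/docs/example.pdf</loc><changefreq>never</changefreq></url>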
" ## very static files which rarely change: PDFs, images, site infrastructure: find -L _site/docs/ _site/images/ _site/static/ -not -name "*.page" -type f | fgrep --invert-match -e 'docs/www/' -e 'docs/link-bibliography' -e 'metadata/' -e '.git' -e '404' -e '/static/templates/default.html' -e '/docs/personal/index' | \ sort | xargs urlencode -m | sed -e 's/%20/\n/g' | \ sed -e 's/_site\/\(.*\)/\\https:\/\/www\.gwern\.net\/\1<\/loc>never<\/changefreq><\/url>/' ## Everything else changes once in a while: find -L _site/ -not -name "*.page" -type f | fgrep --invert-match -e 'static/' -e 'docs/' -e 'images/' -e 'Fulltext' -e 'metadata/' -e '-768px.' | \ sort | xargs urlencode -m | sed -e 's/%20/\n/g' | \ sed -e 's/_site\/\(.*\)/\\https:\/\/www\.gwern\.net\/\1<\/loc>monthly<\/changefreq><\/url>/' echo "") >> ./_site/sitemap.xml ## generate a syntax-highlighted HTML fragment (not whole standalone page) version of source code files for popup usage: ### We skip .json/.jsonl/.csv because they are too large & Pandoc will choke; and we truncate at 1000 lines because such ### long source files are not readable as popups and their complexity makes browsers choke while rendering them. ### (We include plain text files in this in order to get truncated versions of them.) bold "Generating syntax-highlighted versions of source code files…" syntaxHighlight () { #### NOTE: for each new extension, add a `find` name, and an entry in `extracts-content.js` declare -A extensionToLanguage=( ["R"]="R" ["c"]="C" ["py"]="Python" ["css"]="CSS" ["hs"]="Haskell" ["js"]="Javascript" ["patch"]="Diff" ["diff"]="Diff" ["sh"]="Bash" ["bash"]="Bash" ["html"]="HTML" ["conf"]="Bash" ["php"]="PHP" ["opml"]="Xml" ["xml"]="Xml" ["page"]="Markdown" ["txt"]="" ["yaml"]="YAML" ["jsonl"]="JSON" ["json"]="JSON" ["csv"]="CSV" ) for FILE in "$@"; do FILEORIGINAL=$(echo "$FILE" | sed -e 's/_site//') FILENAME=$(basename -- "$FILE") EXTENSION="${FILENAME##*.}" LANGUAGE=${extensionToLanguage[$EXTENSION]} FILELENGTH=$(cat "$FILE" | wc --lines) (echo -e "~~~{.$LANGUAGE}"; if [ $EXTENSION == "page" ]; then # the very long lines look bad in narrow popups, so we fold: cat "$FILE" | fold --spaces --width=65 | head -1100 | iconv -t utf8 -c; else cat "$FILE" | head -1000; fi echo -e "\n~~~" if (( $FILELENGTH >= 1000 )); then echo -e "\n\n…[File truncated due to length; see original file]…"; fi; ) | pandoc --mathjax --write=html5 --from=markdown+smart >> $FILE.html done } export -f syntaxHighlight set +e find _site/static/ -type f,l -name "*.html" | sort | parallel --jobs 25 syntaxHighlight # NOTE: run .html first to avoid duplicate files like 'foo.js.html.html' find _site/ -type f,l -name "*.R" -or -name "*.c" -or -name "*.css" -or -name "*.hs" -or -name "*.js" -or -name "*.patch" -or -name "*.diff" -or -name "*.py" -or -name "*.sh" -or -name "*.bash" -or -name "*.php" -or -name "*.conf" -or -name "*.opml" -or -name "*.page" -or -name "*.txt" -or -name "*.json" -or -name "*.jsonl" -or -name "*.yaml" -or -name "*.xml" -or -name "*.csv" | \ sort | fgrep -v \ `# Pandoc fails on embedded Unicode/regexps in JQuery` \ -e 'mountimprobable.com/assets/app.js' -e 'jquery.min.js' \ -e 'metadata/backlinks.hs' -e 'metadata/embeddings.bin' -e 'metadata/archive.hs' -e 'docs/www/' | parallel --jobs 25 syntaxHighlight set -e bold "Stripping compile-time-only classes unnecessary at runtime…" cleanClasses () { sed -i -e 's/class=\"\(.*\)archive-local \?/class="\1/g' \ -e 's/class=\"\(.*\)archive-not \?/class="\1/g' \ -e 's/class=\"\(.*\)backlink-not \?/class="\1/g' \ -e 
's/class=\"\(.*\)id-not \?/class="\1/g' \ -e 's/class=\"\(.*\)link-annotated-not \?/class="\1/g' \ -e 's/class=\"\(.*\)link-live-not \?/class="\1/g' \ -e 's/ data-embedding[-Dd]istance="0.[0-9]\+"//' \ -e 's/ data-link[-Tt]ags="[a-z0-9 \/-]\+">/>/' \ "$@"; }; export -f cleanClasses find ./ -path ./_site -prune -type f -o -name "*.page" | fgrep -v -e '#' | sort | sed -e 's/\.page$//' -e 's/\.\/\(.*\)/_site\/\1/' | parallel --max-args=100 cleanClasses || true find ./_site/metadata/ -type f -name "*.html" | sort | parallel --max-args=100 cleanClasses || true ## Pandoc/Skylighting by default adds empty self-links to line-numbered code blocks to make them clickable (as opposed to just setting a span ID, which it also does). These links *would* be hidden except that self links get marked up with up/down arrows, so arrows decorate the codeblocks. We have no use for them and Pandoc/skylighting has no option or way to disable them, so we strip them. bold "Stripping self-links from syntax-highlighted HTML…" cleanCodeblockSelflinks () { if [[ $(fgrep -e 'class="sourceCode' "$@") ]]; then sed -i -e 's/