// @flow import fs from "fs"; import path from "path"; import File from "formidable/lib/file"; import { strikethrough, tables } from "joplin-turndown-plugin-gfm"; import mammoth from "mammoth"; import quotedPrintable from "quoted-printable"; import TurndownService from "turndown"; import utf8 from "utf8"; import parseTitle from "../../shared/utils/parseTitle"; import { FileImportError, InvalidRequestError } from "../errors"; import { User } from "../models"; import dataURItoBuffer from "../utils/dataURItoBuffer"; import { deserializeFilename } from "../utils/fs"; import parseImages from "../utils/parseImages"; import attachmentCreator from "./attachmentCreator"; // https://github.com/domchristie/turndown#options const turndownService = new TurndownService({ hr: "---", bulletListMarker: "-", headingStyle: "atx", }); // Use the GitHub-flavored markdown plugin to parse // strikethoughs and tables turndownService .use(strikethrough) .use(tables) .addRule("breaks", { filter: ["br"], replacement: function (content) { return "\n"; }, }); interface ImportableFile { type: string; getMarkdown: (file: any) => Promise; } const importMapping: ImportableFile[] = [ { type: "application/msword", getMarkdown: confluenceToMarkdown, }, { type: "application/octet-stream", getMarkdown: docxToMarkdown, }, { type: "application/vnd.openxmlformats-officedocument.wordprocessingml.document", getMarkdown: docxToMarkdown, }, { type: "text/html", getMarkdown: htmlToMarkdown, }, { type: "text/plain", getMarkdown: fileToMarkdown, }, { type: "text/markdown", getMarkdown: fileToMarkdown, }, ]; async function fileToMarkdown(file): Promise { return fs.promises.readFile(file.path, "utf8"); } async function docxToMarkdown(file): Promise { const { value } = await mammoth.convertToHtml(file); return turndownService.turndown(value); } async function htmlToMarkdown(file): Promise { const value = await fs.promises.readFile(file.path, "utf8"); return turndownService.turndown(value); } async function confluenceToMarkdown(file): Promise { let value = await fs.promises.readFile(file.path, "utf8"); // We're only supporting the ridiculous output from Confluence here, regular // Word documents should call into the docxToMarkdown importer. // See: https://jira.atlassian.com/browse/CONFSERVER-38237 if (!value.includes("Content-Type: multipart/related")) { throw new FileImportError("Unsupported Word file"); } // get boundary marker const boundaryMarker = value.match(/boundary="(.+)"/); if (!boundaryMarker) { throw new FileImportError("Unsupported Word file (No boundary marker)"); } // get content between multipart boundaries let boundaryReached = 0; const lines = value.split("\n").filter((line) => { if (line.includes(boundaryMarker[1])) { boundaryReached++; return false; } if (line.startsWith("Content-")) { return false; } // 1 == definition // 2 == content // 3 == ending if (boundaryReached === 2) { return true; } return false; }); if (!lines.length) { throw new FileImportError("Unsupported Word file (No content found)"); } // Mime attachment is "quoted printable" encoded, must be decoded first // https://en.wikipedia.org/wiki/Quoted-printable value = utf8.decode(quotedPrintable.decode(lines.join("\n"))); // If we don't remove the title here it becomes printed in the document // body by turndown turndownService.remove(["style", "xml", "title"]); // Now we should have something that looks like HTML const html = turndownService.turndown(value); return html.replace(/
/g, " \\n "); } export default async function documentImporter({ file, user, ip, }: { user: User, file: File, ip: string, }): Promise<{ text: string, title: string }> { const fileInfo = importMapping.filter((item) => { if (item.type === file.type) { if ( file.type === "application/octet-stream" && path.extname(file.name) !== ".docx" ) { return false; } return true; } if (item.type === "text/markdown" && path.extname(file.name) === ".md") { return true; } return false; })[0]; if (!fileInfo) { throw new InvalidRequestError(`File type ${file.type} not supported`); } let title = deserializeFilename(file.name.replace(/\.[^/.]+$/, "")); let text = await fileInfo.getMarkdown(file); // If the first line of the imported text looks like a markdown heading // then we can use this as the document title if (text.trim().startsWith("# ")) { const result = parseTitle(text); title = result.title; text = text.replace(`# ${title}\n`, ""); } // find data urls, convert to blobs, upload and write attachments const images = parseImages(text); const dataURIs = images.filter((href) => href.startsWith("data:")); for (const uri of dataURIs) { const name = "imported"; const { buffer, type } = dataURItoBuffer(uri); const attachment = await attachmentCreator({ name, type, buffer, user, ip, }); text = text.replace(uri, attachment.redirectUrl); } return { text, title }; }