feat: Add support for "word" files exported from Confluence (#1600)

* Display error message to end user
* fix: Improve conversion of tables
* fix: Characters at ends of lines in tables lost
2020-10-21 08:53:59 -07:00
parent b78e2f1e05
commit df7b9f3e88
9 changed files with 394 additions and 3 deletions
--- a/server/commands/documentImporter.js
+++ b/server/commands/documentImporter.js
@ -1,11 +1,14 @@
 // @flow
 import fs from "fs";
 import File from "formidable/lib/file";
+import { strikethrough, tables } from "joplin-turndown-plugin-gfm";
 import mammoth from "mammoth";
+import quotedPrintable from "quoted-printable";
 import TurndownService from "turndown";
+import utf8 from "utf8";
 import uuid from "uuid";
 import parseTitle from "../../shared/utils/parseTitle";
-import { InvalidRequestError } from "../errors";
+import { FileImportError, InvalidRequestError } from "../errors";
 import { Attachment, Event, User } from "../models";
 import dataURItoBuffer from "../utils/dataURItoBuffer";
 import parseImages from "../utils/parseImages";
@ -18,12 +21,28 @@ const turndownService = new TurndownService({
  headingStyle: "atx",
 });

+// Use the GitHub-flavored markdown plugin to parse
+// strikethoughs and tables
+turndownService
+  .use(strikethrough)
+  .use(tables)
+  .addRule("breaks", {
+    filter: ["br"],
+    replacement: function (content) {
+      return "\n";
+    },
+  });
+
 // Describes one supported import format: a MIME type paired with the
 // converter used for files of that type.
 interface ImportableFile {
  // MIME type string this entry applies to
  type: string;
  // Converter for the given file; resolves to a string (presumably the
  // document's markdown, going by the name)
  getMarkdown: (file: any) => Promise<string>;
 }

 const importMapping: ImportableFile[] = [
+  {
+    type: "application/msword",
+    getMarkdown: confluenceToMarkdown,
+  },
  {
    type:
      "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@ -57,6 +76,60 @@ async function htmlToMarkdown(file): Promise<string> {
  return turndownService.turndown(value);
 }

+async function confluenceToMarkdown(file): Promise<string> {
+  let value = await fs.promises.readFile(file.path, "utf8");
+
+  // We're only supporting the ridiculous output from Confluence here, regular
+  // Word documents should call into the docxToMarkdown importer.
+  // See: https://jira.atlassian.com/browse/CONFSERVER-38237
+  if (!value.includes("Content-Type: multipart/related")) {
+    throw new FileImportError("Unsupported Word file");
+  }
+
+  // get boundary marker
+  const boundaryMarker = value.match(/boundary="(.+)"/);
+  if (!boundaryMarker) {
+    throw new FileImportError("Unsupported Word file (No boundary marker)");
+  }
+
+  // get content between multipart boundaries
+  let boundaryReached = 0;
+  const lines = value.split("\n").filter((line) => {
+    if (line.includes(boundaryMarker[1])) {
+      boundaryReached++;
+      return false;
+    }
+    if (line.startsWith("Content-")) {
+      return false;
+    }
+
+    // 1 == definition
+    // 2 == content
+    // 3 == ending
+    if (boundaryReached === 2) {
+      return true;
+    }
+    return false;
+  });
+
+  if (!lines.length) {
+    throw new FileImportError("Unsupported Word file (No content found)");
+  }
+
+  // Mime attachment is "quoted printable" encoded, must be decoded first
+  // https://en.wikipedia.org/wiki/Quoted-printable
+  value = utf8.decode(quotedPrintable.decode(lines.join("\n")));
+
+  // If we don't remove the title here it becomes printed in the document
+  // body by turndown
+  turndownService.remove(["style", "xml", "title"]);
+
+  // Now we should have something that looks like HTML
+  const html = turndownService.turndown(value);
+
+  return html.replace(/<br>/g, " \\n ");
+}
+
 export default async function documentImporter({
  file,
  user,