From df7b9f3e888f771fb66d6e951e5c1ce5ac13d9c3 Mon Sep 17 00:00:00 2001 From: Tom Moor Date: Wed, 21 Oct 2020 08:53:59 -0700 Subject: [PATCH] feat: Add support for "word" files exported from Confluence (#1600) * Display error message to end user * fix: Improve conversion of tables * fix: Characters at ends of lines in tables lost --- app/components/DropToImport.js | 6 +- app/stores/DocumentsStore.js | 1 + package.json | 5 +- server/commands/documentImporter.js | 75 ++++++- server/commands/documentImporter.test.js | 19 ++ server/errors.js | 6 + server/test/fixtures/confluence.doc | 242 +++++++++++++++++++++++ server/test/fixtures/tables.html | 26 +++ yarn.lock | 17 ++ 9 files changed, 394 insertions(+), 3 deletions(-) create mode 100644 server/test/fixtures/confluence.doc create mode 100644 server/test/fixtures/tables.html diff --git a/app/components/DropToImport.js b/app/components/DropToImport.js index 54b936f6..5d0deaa7 100644 --- a/app/components/DropToImport.js +++ b/app/components/DropToImport.js @@ -7,6 +7,7 @@ import Dropzone from "react-dropzone"; import { withRouter, type RouterHistory, type Match } from "react-router-dom"; import { createGlobalStyle } from "styled-components"; import DocumentsStore from "stores/DocumentsStore"; +import UiStore from "stores/UiStore"; import LoadingIndicator from "components/LoadingIndicator"; const EMPTY_OBJECT = {}; @@ -18,6 +19,7 @@ type Props = { documentId?: string, activeClassName?: string, rejectClassName?: string, + ui: UiStore, documents: DocumentsStore, disabled: boolean, location: Object, @@ -71,6 +73,8 @@ class DropToImport extends React.Component { this.props.history.push(doc.url); } } + } catch (err) { + this.props.ui.showToast(`Could not import file. 
${err.message}`); } finally { this.isImporting = false; importingLock = false; @@ -109,4 +113,4 @@ class DropToImport extends React.Component { } } -export default inject("documents")(withRouter(DropToImport)); +export default inject("documents", "ui")(withRouter(DropToImport)); diff --git a/app/stores/DocumentsStore.js b/app/stores/DocumentsStore.js index 1ddada7e..512b4104 100644 --- a/app/stores/DocumentsStore.js +++ b/app/stores/DocumentsStore.js @@ -22,6 +22,7 @@ export default class DocumentsStore extends BaseStore { "text/markdown", "text/plain", "text/html", + "application/msword", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", ]; diff --git a/package.json b/package.json index 838db575..1a393b54 100644 --- a/package.json +++ b/package.json @@ -97,6 +97,7 @@ "invariant": "^2.2.2", "ioredis": "^4.14.1", "isomorphic-fetch": "2.2.1", + "joplin-turndown-plugin-gfm": "^1.0.12", "js-search": "^1.4.2", "json-loader": "0.5.4", "jsonwebtoken": "^8.5.0", @@ -126,6 +127,7 @@ "pg-hstore": "^2.3.3", "polished": "3.6.5", "query-string": "^4.3.4", + "quoted-printable": "^1.0.1", "randomstring": "1.1.5", "raw-loader": "^0.5.1", "react": "^16.8.6", @@ -158,6 +160,7 @@ "tiny-cookie": "^2.3.1", "tmp": "0.0.33", "turndown": "^6.0.0", + "utf8": "^2.1.0", "uuid": "2.0.2", "validator": "5.2.0" }, @@ -195,4 +198,4 @@ "js-yaml": "^3.13.1" }, "version": "0.48.1" -} \ No newline at end of file +} diff --git a/server/commands/documentImporter.js b/server/commands/documentImporter.js index 4410914f..26c1c121 100644 --- a/server/commands/documentImporter.js +++ b/server/commands/documentImporter.js @@ -1,11 +1,14 @@ // @flow import fs from "fs"; import File from "formidable/lib/file"; +import { strikethrough, tables } from "joplin-turndown-plugin-gfm"; import mammoth from "mammoth"; +import quotedPrintable from "quoted-printable"; import TurndownService from "turndown"; +import utf8 from "utf8"; import uuid from "uuid"; import parseTitle from 
"../../shared/utils/parseTitle"; -import { InvalidRequestError } from "../errors"; +import { FileImportError, InvalidRequestError } from "../errors"; import { Attachment, Event, User } from "../models"; import dataURItoBuffer from "../utils/dataURItoBuffer"; import parseImages from "../utils/parseImages"; @@ -18,12 +21,28 @@ const turndownService = new TurndownService({ headingStyle: "atx", }); +// Use the GitHub-flavored markdown plugin to parse +// strikethroughs and tables +turndownService + .use(strikethrough) + .use(tables) + .addRule("breaks", { + filter: ["br"], + replacement: function (content) { + return "\n"; + }, + }); + interface ImportableFile { type: string; getMarkdown: (file: any) => Promise; } const importMapping: ImportableFile[] = [ + { + type: "application/msword", + getMarkdown: confluenceToMarkdown, + }, { type: "application/vnd.openxmlformats-officedocument.wordprocessingml.document", @@ -57,6 +76,60 @@ async function htmlToMarkdown(file): Promise { return turndownService.turndown(value); } +async function confluenceToMarkdown(file): Promise { + let value = await fs.promises.readFile(file.path, "utf8"); + + // We're only supporting the ridiculous output from Confluence here, regular + // Word documents should call into the docxToMarkdown importer. 
+ // See: https://jira.atlassian.com/browse/CONFSERVER-38237 + if (!value.includes("Content-Type: multipart/related")) { + throw new FileImportError("Unsupported Word file"); + } + + // get boundary marker + const boundaryMarker = value.match(/boundary="(.+)"/); + if (!boundaryMarker) { + throw new FileImportError("Unsupported Word file (No boundary marker)"); + } + + // get content between multipart boundaries + let boundaryReached = 0; + const lines = value.split("\n").filter((line) => { + if (line.includes(boundaryMarker[1])) { + boundaryReached++; + return false; + } + if (line.startsWith("Content-")) { + return false; + } + + // 1 == definition + // 2 == content + // 3 == ending + if (boundaryReached === 2) { + return true; + } + return false; + }); + + if (!lines.length) { + throw new FileImportError("Unsupported Word file (No content found)"); + } + + // Mime attachment is "quoted printable" encoded, must be decoded first + // https://en.wikipedia.org/wiki/Quoted-printable + value = utf8.decode(quotedPrintable.decode(lines.join("\n"))); + + // If we don't remove the title here it becomes printed in the document + // body by turndown + turndownService.remove(["style", "xml", "title"]); + + // Now we should have something that looks like HTML + const html = turndownService.turndown(value); + + return html.replace(/
/g, " \\n "); +} + export default async function documentImporter({ file, user, diff --git a/server/commands/documentImporter.test.js b/server/commands/documentImporter.test.js index 66714eee..8acb1221 100644 --- a/server/commands/documentImporter.test.js +++ b/server/commands/documentImporter.test.js @@ -56,6 +56,25 @@ describe("documentImporter", () => { expect(response.title).toEqual("Heading 1"); }); + it("should convert Confluence Word output to markdown", async () => { + const user = await buildUser(); + const name = "confluence.doc"; + const file = new File({ + name, + type: "application/msword", + path: path.resolve(__dirname, "..", "test", "fixtures", name), + }); + + const response = await documentImporter({ + user, + file, + ip, + }); + + expect(response.text).toContain("this is a test document"); + expect(response.title).toEqual("Heading 1"); + }); + it("should load markdown", async () => { const user = await buildUser(); const name = "markdown.md"; diff --git a/server/errors.js b/server/errors.js index 4e72e981..2ae4b9ba 100644 --- a/server/errors.js +++ b/server/errors.js @@ -51,3 +51,9 @@ export function EditorUpdateError( ) { return httpErrors(400, message, { id: "editor_update_required" }); } + +export function FileImportError( + message: string = "The file could not be imported" +) { + return httpErrors(400, message, { id: "import_error" }); +} diff --git a/server/test/fixtures/confluence.doc b/server/test/fixtures/confluence.doc new file mode 100644 index 00000000..cc44ce51 --- /dev/null +++ b/server/test/fixtures/confluence.doc @@ -0,0 +1,242 @@ +Date: Thu, 15 Oct 2020 21:46:24 +0000 (UTC) +Message-ID: <1157346768.61.1602798384354@1c1c2e46a009> +Subject: Exported From Confluence +MIME-Version: 1.0 +Content-Type: multipart/related; + boundary="----=_Part_60_1619701709.1602798384353" + +------=_Part_60_1619701709.1602798384353 +Content-Type: text/html; charset=UTF-8 +Content-Transfer-Encoding: quoted-printable +Content-Location: 
file:///C:/exported.html + + + + + Heading 1 + + + + +

Heading 1

+
+

+

can=E2=80=99t this is a test document ge= +ts updated/reverted you can=E2=80=99t change the branch test, create a new = +one

+
+ + +------=_Part_60_1619701709.1602798384353-- diff --git a/server/test/fixtures/tables.html b/server/test/fixtures/tables.html new file mode 100644 index 00000000..600c84ae --- /dev/null +++ b/server/test/fixtures/tables.html @@ -0,0 +1,26 @@ + + + +

Heading 1

+ + + + + + + + + + + + + + + + + + +
Col 1Col 2
Col 1.1Col 2.1
Col 1.2Col 2.2
+ + + \ No newline at end of file diff --git a/yarn.lock b/yarn.lock index 840aa3ed..abf9a13a 100644 --- a/yarn.lock +++ b/yarn.lock @@ -6699,6 +6699,11 @@ jmespath@0.15.0: resolved "https://registry.yarnpkg.com/jmespath/-/jmespath-0.15.0.tgz#a3f222a9aae9f966f5d27c796510e28091764217" integrity sha1-o/Iiqarp+Wb10nx5ZRDigJF2Qhc= +joplin-turndown-plugin-gfm@^1.0.12: + version "1.0.12" + resolved "https://registry.yarnpkg.com/joplin-turndown-plugin-gfm/-/joplin-turndown-plugin-gfm-1.0.12.tgz#f0774183177895c6fedeec951053cab6046dede8" + integrity sha512-qL4+1iycQjZ1fs8zk3jSRk7cg3ROBUHk7GKtiLAQLFzLPKErnILUvz5DLszSQvz3s1sTjPbywLDISVUtBY6HaA== + js-beautify@^1.8.8: version "1.11.0" resolved "https://registry.yarnpkg.com/js-beautify/-/js-beautify-1.11.0.tgz#afb873dc47d58986360093dcb69951e8bcd5ded2" @@ -9270,6 +9275,13 @@ querystring@0.2.0, querystring@^0.2.0: resolved "https://registry.yarnpkg.com/querystring/-/querystring-0.2.0.tgz#b209849203bb25df820da756e747005878521620" integrity sha1-sgmEkgO7Jd+CDadW50cAWHhSFiA= +quoted-printable@^1.0.1: + version "1.0.1" + resolved "https://registry.yarnpkg.com/quoted-printable/-/quoted-printable-1.0.1.tgz#9eebf5eb3d11eef022b264fd2d2b6b2bb3b84cc3" + integrity sha1-nuv16z0R7vAismT9LStrK7O4TMM= + dependencies: + utf8 "^2.1.0" + randombytes@^2.0.0, randombytes@^2.0.1, randombytes@^2.0.5, randombytes@^2.1.0: version "2.1.0" resolved "https://registry.yarnpkg.com/randombytes/-/randombytes-2.1.0.tgz#df6f84372f0270dc65cdf6291349ab7a473d4f2a" @@ -11619,6 +11631,11 @@ use@^3.1.0: resolved "https://registry.yarnpkg.com/use/-/use-3.1.1.tgz#d50c8cac79a19fbc20f2911f56eb973f4e10070f" integrity sha512-cwESVXlO3url9YWlFW/TA9cshCEhtu7IKJ/p5soJ/gGpj7vbvFrAY/eIioQ6Dw23KjZhYgiIo8HOs1nQ2vr/oQ== +utf8@^2.1.0: + version "2.1.2" + resolved "https://registry.yarnpkg.com/utf8/-/utf8-2.1.2.tgz#1fa0d9270e9be850d9b05027f63519bf46457d96" + integrity sha1-H6DZJw6b6FDZsFAn9jUZv0ZFfZY= + util-deprecate@^1.0.1, util-deprecate@~1.0.1: version "1.0.2" resolved 
"https://registry.yarnpkg.com/util-deprecate/-/util-deprecate-1.0.2.tgz#450d4dc9fa70de732762fbd2d4a28981419a0ccf"