feat: Add support for "word" files exported from Confluence (#1600)

* Display error message to end user

* fix: Improve conversion of tables

* fix: Characters at ends of lines in tables lost
This commit is contained in:
Tom Moor 2020-10-21 08:53:59 -07:00 committed by GitHub
parent b78e2f1e05
commit df7b9f3e88
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 394 additions and 3 deletions

View File

@ -7,6 +7,7 @@ import Dropzone from "react-dropzone";
import { withRouter, type RouterHistory, type Match } from "react-router-dom";
import { createGlobalStyle } from "styled-components";
import DocumentsStore from "stores/DocumentsStore";
import UiStore from "stores/UiStore";
import LoadingIndicator from "components/LoadingIndicator";
const EMPTY_OBJECT = {};
@ -18,6 +19,7 @@ type Props = {
documentId?: string,
activeClassName?: string,
rejectClassName?: string,
ui: UiStore,
documents: DocumentsStore,
disabled: boolean,
location: Object,
@ -71,6 +73,8 @@ class DropToImport extends React.Component<Props> {
this.props.history.push(doc.url);
}
}
} catch (err) {
this.props.ui.showToast(`Could not import file. ${err.message}`);
} finally {
this.isImporting = false;
importingLock = false;
@ -109,4 +113,4 @@ class DropToImport extends React.Component<Props> {
}
}
export default inject("documents")(withRouter(DropToImport));
export default inject("documents", "ui")(withRouter(DropToImport));

View File

@ -22,6 +22,7 @@ export default class DocumentsStore extends BaseStore<Document> {
"text/markdown",
"text/plain",
"text/html",
"application/msword",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
];

View File

@ -97,6 +97,7 @@
"invariant": "^2.2.2",
"ioredis": "^4.14.1",
"isomorphic-fetch": "2.2.1",
"joplin-turndown-plugin-gfm": "^1.0.12",
"js-search": "^1.4.2",
"json-loader": "0.5.4",
"jsonwebtoken": "^8.5.0",
@ -126,6 +127,7 @@
"pg-hstore": "^2.3.3",
"polished": "3.6.5",
"query-string": "^4.3.4",
"quoted-printable": "^1.0.1",
"randomstring": "1.1.5",
"raw-loader": "^0.5.1",
"react": "^16.8.6",
@ -158,6 +160,7 @@
"tiny-cookie": "^2.3.1",
"tmp": "0.0.33",
"turndown": "^6.0.0",
"utf8": "^2.1.0",
"uuid": "2.0.2",
"validator": "5.2.0"
},
@ -195,4 +198,4 @@
"js-yaml": "^3.13.1"
},
"version": "0.48.1"
}
}

View File

@ -1,11 +1,14 @@
// @flow
import fs from "fs";
import File from "formidable/lib/file";
import { strikethrough, tables } from "joplin-turndown-plugin-gfm";
import mammoth from "mammoth";
import quotedPrintable from "quoted-printable";
import TurndownService from "turndown";
import utf8 from "utf8";
import uuid from "uuid";
import parseTitle from "../../shared/utils/parseTitle";
import { InvalidRequestError } from "../errors";
import { FileImportError, InvalidRequestError } from "../errors";
import { Attachment, Event, User } from "../models";
import dataURItoBuffer from "../utils/dataURItoBuffer";
import parseImages from "../utils/parseImages";
@ -18,12 +21,28 @@ const turndownService = new TurndownService({
headingStyle: "atx",
});
// Register the GitHub-flavored markdown plugins so that strikethroughs and
// tables are converted, then add a rule that turns every <br> element into
// a newline.
turndownService.use(strikethrough);
turndownService.use(tables);
turndownService.addRule("breaks", {
  filter: ["br"],
  replacement: () => "\n",
});
// Pairs a MIME type with the function that converts a file of that type
// into markdown.
interface ImportableFile {
// MIME type handled by this entry, e.g. "application/msword"
type: string;
// Converter for the file; resolves to a string (the markdown output)
getMarkdown: (file: any) => Promise<string>;
}
const importMapping: ImportableFile[] = [
{
type: "application/msword",
getMarkdown: confluenceToMarkdown,
},
{
type:
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@ -57,6 +76,60 @@ async function htmlToMarkdown(file): Promise<string> {
return turndownService.turndown(value);
}
async function confluenceToMarkdown(file): Promise<string> {
let value = await fs.promises.readFile(file.path, "utf8");
// We're only supporting the ridiculous output from Confluence here, regular
// Word documents should call into the docxToMarkdown importer.
// See: https://jira.atlassian.com/browse/CONFSERVER-38237
if (!value.includes("Content-Type: multipart/related")) {
throw new FileImportError("Unsupported Word file");
}
// get boundary marker
const boundaryMarker = value.match(/boundary="(.+)"/);
if (!boundaryMarker) {
throw new FileImportError("Unsupported Word file (No boundary marker)");
}
// get content between multipart boundaries
let boundaryReached = 0;
const lines = value.split("\n").filter((line) => {
if (line.includes(boundaryMarker[1])) {
boundaryReached++;
return false;
}
if (line.startsWith("Content-")) {
return false;
}
// 1 == definition
// 2 == content
// 3 == ending
if (boundaryReached === 2) {
return true;
}
return false;
});
if (!lines.length) {
throw new FileImportError("Unsupported Word file (No content found)");
}
// Mime attachment is "quoted printable" encoded, must be decoded first
// https://en.wikipedia.org/wiki/Quoted-printable
value = utf8.decode(quotedPrintable.decode(lines.join("\n")));
// If we don't remove the title here it becomes printed in the document
// body by turndown
turndownService.remove(["style", "xml", "title"]);
// Now we should have something that looks like HTML
const html = turndownService.turndown(value);
return html.replace(/<br>/g, " \\n ");
}
export default async function documentImporter({
file,
user,

View File

@ -56,6 +56,25 @@ describe("documentImporter", () => {
expect(response.title).toEqual("Heading 1");
});
// Imports the Confluence "Word" fixture and checks both the converted body
// text and the title taken from the document.
it("should convert Confluence Word output to markdown", async () => {
  const fixtureName = "confluence.doc";
  const fixturePath = path.resolve(
    __dirname,
    "..",
    "test",
    "fixtures",
    fixtureName
  );

  const user = await buildUser();
  const file = new File({
    name: fixtureName,
    type: "application/msword",
    path: fixturePath,
  });

  const { text, title } = await documentImporter({ user, file, ip });

  expect(text).toContain("this is a test document");
  expect(title).toEqual("Heading 1");
});
it("should load markdown", async () => {
const user = await buildUser();
const name = "markdown.md";

View File

@ -51,3 +51,9 @@ export function EditorUpdateError(
) {
return httpErrors(400, message, { id: "editor_update_required" });
}
export function FileImportError(
message: string = "The file could not be imported"
) {
return httpErrors(400, message, { id: "import_error" });
}

242
server/test/fixtures/confluence.doc vendored Normal file
View File

@ -0,0 +1,242 @@
Date: Thu, 15 Oct 2020 21:46:24 +0000 (UTC)
Message-ID: <1157346768.61.1602798384354@1c1c2e46a009>
Subject: Exported From Confluence
MIME-Version: 1.0
Content-Type: multipart/related;
boundary="----=_Part_60_1619701709.1602798384353"
------=_Part_60_1619701709.1602798384353
Content-Type: text/html; charset=UTF-8
Content-Transfer-Encoding: quoted-printable
Content-Location: file:///C:/exported.html
<html xmlns:o=3D'urn:schemas-microsoft-com:office:office'
xmlns:w=3D'urn:schemas-microsoft-com:office:word'
xmlns:v=3D'urn:schemas-microsoft-com:vml'
xmlns=3D'urn:w3-org-ns:HTML'>
<head>
<meta http-equiv=3D"Content-Type" content=3D"text/html; charset=3Dutf-8=
">
<title>Heading 1</title>
<!--[if gte mso 9]>
<xml>
<o:OfficeDocumentSettings>
<o:TargetScreenSize>1024x640</o:TargetScreenSize>
<o:PixelsPerInch>72</o:PixelsPerInch>
<o:AllowPNG/>
</o:OfficeDocumentSettings>
<w:WordDocument>
<w:View>Print</w:View>
<w:Zoom>90</w:Zoom>
<w:DoNotOptimizeForBrowser/>
</w:WordDocument>
</xml>
<![endif]-->
<style>
<!--
@page Section1 {
size: 8.5in 11.0in;
margin: 1.0in;
mso-header-margin: .5in;
mso-footer-margin: .5in;
mso-paper-source: 0;
}
table {
border: solid 1px;
border-collapse: collapse;
}
table td, table th {
border: solid 1px;
padding: 5px;
}
td {
page-break-inside: avoid;
}
tr {
page-break-after: avoid;
}
div.Section1 {
page: Section1;
}
/* Confluence print stylesheet. Common to all themes for print medi=
a */
/* Full of !important until we improve batching for print CSS */
@media print {
#main {
padding-bottom: 1em !important; /* The default padding of 6em is to=
o much for printouts */
}
body {
font-family: Arial, Helvetica, FreeSans, sans-serif;
font-size: 10pt;
line-height: 1.2;
}
body, #full-height-container, #main, #page, #content, .has-personal-sid=
ebar #content {
background: #fff !important;
color: #000 !important;
border: 0 !important;
width: 100% !important;
height: auto !important;
min-height: auto !important;
margin: 0 !important;
padding: 0 !important;
display: block !important;
}
a, a:link, a:visited, a:focus, a:hover, a:active {
color: #000;
}
#content h1,
#content h2,
#content h3,
#content h4,
#content h5,
#content h6 {
font-family: Arial, Helvetica, FreeSans, sans-serif;
page-break-after: avoid;
}
pre {
font-family: Monaco, "Courier New", monospace;
}
#header,
.aui-header-inner,
#navigation,
#sidebar,
.sidebar,
#personal-info-sidebar,
.ia-fixed-sidebar,
.page-actions,
.navmenu,
.ajs-menu-bar,
.noprint,
.inline-control-link,
.inline-control-link a,
a.show-labels-editor,
.global-comment-actions,
.comment-actions,
.quick-comment-container,
#addcomment {
display: none !important;
}
/* CONF-28544 cannot print multiple pages in IE */
#splitter-content {
position: relative !important;
}
.comment .date::before {
content: none !important; /* remove middot for print view */
}
h1.pagetitle img {
height: auto;
width: auto;
}
.print-only {
display: block;
}
#footer {
position: relative !important; /* CONF-17506 Place the footer at en=
d of the content */
margin: 0;
padding: 0;
background: none;
clear: both;
}
#poweredby {
border-top: none;
background: none;
}
#poweredby li.print-only {
display: list-item;
font-style: italic;
}
#poweredby li.noprint {
display: none;
}
/* no width controls in print */
.wiki-content .table-wrap,
.wiki-content p,
.panel .codeContent,
.panel .codeContent pre,
.image-wrap {
overflow: visible !important;
}
/* TODO - should this work? */
#children-section,
#comments-section .comment,
#comments-section .comment .comment-body,
#comments-section .comment .comment-content,
#comments-section .comment p {
page-break-inside: avoid;
}
#page-children a {
text-decoration: none;
}
/**
hide twixies
the specificity here is a hack because print styles
are getting loaded before the base styles. */
#comments-section.pageSection .section-header,
#comments-section.pageSection .section-title,
#children-section.pageSection .section-header,
#children-section.pageSection .section-title,
.children-show-hide {
padding-left: 0;
margin-left: 0;
}
.children-show-hide.icon {
display: none;
}
/* personal sidebar */
.has-personal-sidebar #content {
margin-right: 0px;
}
.has-personal-sidebar #content .pageSection {
margin-right: 0px;
}
.no-print, .no-print * {
display: none !important;
}
}
-->
</style>
</head>
<body>
<h1>Heading 1</h1>
<div class=3D"Section1">
<p></p>
<p>can=E2=80=99t this is a test document ge=
ts updated/reverted you can=E2=80=99t change the branch test, create a new =
one</p>
</div>
</body>
</html>
------=_Part_60_1619701709.1602798384353--

26
server/test/fixtures/tables.html vendored Normal file
View File

@ -0,0 +1,26 @@
<html>
<body>
<h1>Heading 1</h1>
<table>
<thead>
<tr>
<th>Col 1</th>
<th>Col 2</th>
</tr>
</thead>
<tbody>
<tr>
<td>Col 1.1</td>
<td>Col 2.1</td>
</tr>
<tr>
<td>Col 1.2</td>
<td>Col 2.2</td>
</tr>
</tbody>
</table>
</body>
</html>

View File

@ -6699,6 +6699,11 @@ jmespath@0.15.0:
resolved "https://registry.yarnpkg.com/jmespath/-/jmespath-0.15.0.tgz#a3f222a9aae9f966f5d27c796510e28091764217"
integrity sha1-o/Iiqarp+Wb10nx5ZRDigJF2Qhc=
joplin-turndown-plugin-gfm@^1.0.12:
version "1.0.12"
resolved "https://registry.yarnpkg.com/joplin-turndown-plugin-gfm/-/joplin-turndown-plugin-gfm-1.0.12.tgz#f0774183177895c6fedeec951053cab6046dede8"
integrity sha512-qL4+1iycQjZ1fs8zk3jSRk7cg3ROBUHk7GKtiLAQLFzLPKErnILUvz5DLszSQvz3s1sTjPbywLDISVUtBY6HaA==
js-beautify@^1.8.8:
version "1.11.0"
resolved "https://registry.yarnpkg.com/js-beautify/-/js-beautify-1.11.0.tgz#afb873dc47d58986360093dcb69951e8bcd5ded2"
@ -9270,6 +9275,13 @@ querystring@0.2.0, querystring@^0.2.0:
resolved "https://registry.yarnpkg.com/querystring/-/querystring-0.2.0.tgz#b209849203bb25df820da756e747005878521620"
integrity sha1-sgmEkgO7Jd+CDadW50cAWHhSFiA=
quoted-printable@^1.0.1:
version "1.0.1"
resolved "https://registry.yarnpkg.com/quoted-printable/-/quoted-printable-1.0.1.tgz#9eebf5eb3d11eef022b264fd2d2b6b2bb3b84cc3"
integrity sha1-nuv16z0R7vAismT9LStrK7O4TMM=
dependencies:
utf8 "^2.1.0"
randombytes@^2.0.0, randombytes@^2.0.1, randombytes@^2.0.5, randombytes@^2.1.0:
version "2.1.0"
resolved "https://registry.yarnpkg.com/randombytes/-/randombytes-2.1.0.tgz#df6f84372f0270dc65cdf6291349ab7a473d4f2a"
@ -11619,6 +11631,11 @@ use@^3.1.0:
resolved "https://registry.yarnpkg.com/use/-/use-3.1.1.tgz#d50c8cac79a19fbc20f2911f56eb973f4e10070f"
integrity sha512-cwESVXlO3url9YWlFW/TA9cshCEhtu7IKJ/p5soJ/gGpj7vbvFrAY/eIioQ6Dw23KjZhYgiIo8HOs1nQ2vr/oQ==
utf8@^2.1.0:
version "2.1.2"
resolved "https://registry.yarnpkg.com/utf8/-/utf8-2.1.2.tgz#1fa0d9270e9be850d9b05027f63519bf46457d96"
integrity sha1-H6DZJw6b6FDZsFAn9jUZv0ZFfZY=
util-deprecate@^1.0.1, util-deprecate@~1.0.1:
version "1.0.2"
resolved "https://registry.yarnpkg.com/util-deprecate/-/util-deprecate-1.0.2.tgz#450d4dc9fa70de732762fbd2d4a28981419a0ccf"