feat: Add support for "word" files exported from Confluence (#1600)
* Display error message to end user * fix: Improve conversion of tables * fix: Characters at ends of lines in tables lost
This commit is contained in:
parent
b78e2f1e05
commit
df7b9f3e88
|
@ -7,6 +7,7 @@ import Dropzone from "react-dropzone";
|
|||
import { withRouter, type RouterHistory, type Match } from "react-router-dom";
|
||||
import { createGlobalStyle } from "styled-components";
|
||||
import DocumentsStore from "stores/DocumentsStore";
|
||||
import UiStore from "stores/UiStore";
|
||||
import LoadingIndicator from "components/LoadingIndicator";
|
||||
|
||||
const EMPTY_OBJECT = {};
|
||||
|
@ -18,6 +19,7 @@ type Props = {
|
|||
documentId?: string,
|
||||
activeClassName?: string,
|
||||
rejectClassName?: string,
|
||||
ui: UiStore,
|
||||
documents: DocumentsStore,
|
||||
disabled: boolean,
|
||||
location: Object,
|
||||
|
@ -71,6 +73,8 @@ class DropToImport extends React.Component<Props> {
|
|||
this.props.history.push(doc.url);
|
||||
}
|
||||
}
|
||||
} catch (err) {
|
||||
this.props.ui.showToast(`Could not import file. ${err.message}`);
|
||||
} finally {
|
||||
this.isImporting = false;
|
||||
importingLock = false;
|
||||
|
@ -109,4 +113,4 @@ class DropToImport extends React.Component<Props> {
|
|||
}
|
||||
}
|
||||
|
||||
export default inject("documents")(withRouter(DropToImport));
|
||||
export default inject("documents", "ui")(withRouter(DropToImport));
|
||||
|
|
|
@ -22,6 +22,7 @@ export default class DocumentsStore extends BaseStore<Document> {
|
|||
"text/markdown",
|
||||
"text/plain",
|
||||
"text/html",
|
||||
"application/msword",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
];
|
||||
|
||||
|
|
|
@ -97,6 +97,7 @@
|
|||
"invariant": "^2.2.2",
|
||||
"ioredis": "^4.14.1",
|
||||
"isomorphic-fetch": "2.2.1",
|
||||
"joplin-turndown-plugin-gfm": "^1.0.12",
|
||||
"js-search": "^1.4.2",
|
||||
"json-loader": "0.5.4",
|
||||
"jsonwebtoken": "^8.5.0",
|
||||
|
@ -126,6 +127,7 @@
|
|||
"pg-hstore": "^2.3.3",
|
||||
"polished": "3.6.5",
|
||||
"query-string": "^4.3.4",
|
||||
"quoted-printable": "^1.0.1",
|
||||
"randomstring": "1.1.5",
|
||||
"raw-loader": "^0.5.1",
|
||||
"react": "^16.8.6",
|
||||
|
@ -158,6 +160,7 @@
|
|||
"tiny-cookie": "^2.3.1",
|
||||
"tmp": "0.0.33",
|
||||
"turndown": "^6.0.0",
|
||||
"utf8": "^2.1.0",
|
||||
"uuid": "2.0.2",
|
||||
"validator": "5.2.0"
|
||||
},
|
||||
|
@ -195,4 +198,4 @@
|
|||
"js-yaml": "^3.13.1"
|
||||
},
|
||||
"version": "0.48.1"
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,11 +1,14 @@
|
|||
// @flow
|
||||
import fs from "fs";
|
||||
import File from "formidable/lib/file";
|
||||
import { strikethrough, tables } from "joplin-turndown-plugin-gfm";
|
||||
import mammoth from "mammoth";
|
||||
import quotedPrintable from "quoted-printable";
|
||||
import TurndownService from "turndown";
|
||||
import utf8 from "utf8";
|
||||
import uuid from "uuid";
|
||||
import parseTitle from "../../shared/utils/parseTitle";
|
||||
import { InvalidRequestError } from "../errors";
|
||||
import { FileImportError, InvalidRequestError } from "../errors";
|
||||
import { Attachment, Event, User } from "../models";
|
||||
import dataURItoBuffer from "../utils/dataURItoBuffer";
|
||||
import parseImages from "../utils/parseImages";
|
||||
|
@ -18,12 +21,28 @@ const turndownService = new TurndownService({
|
|||
headingStyle: "atx",
|
||||
});
|
||||
|
||||
// Register the GitHub-flavored markdown plugins on the shared turndown
// service so that strikethroughs and tables are converted, then add a custom
// rule that turns every <br> element into a bare newline.
turndownService.use(strikethrough);
turndownService.use(tables);
turndownService.addRule("breaks", {
  filter: ["br"],
  replacement: () => "\n",
});
|
||||
|
||||
interface ImportableFile {
|
||||
type: string;
|
||||
getMarkdown: (file: any) => Promise<string>;
|
||||
}
|
||||
|
||||
const importMapping: ImportableFile[] = [
|
||||
{
|
||||
type: "application/msword",
|
||||
getMarkdown: confluenceToMarkdown,
|
||||
},
|
||||
{
|
||||
type:
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
|
@ -57,6 +76,60 @@ async function htmlToMarkdown(file): Promise<string> {
|
|||
return turndownService.turndown(value);
|
||||
}
|
||||
|
||||
async function confluenceToMarkdown(file): Promise<string> {
|
||||
let value = await fs.promises.readFile(file.path, "utf8");
|
||||
|
||||
// We're only supporting the ridiculous output from Confluence here, regular
|
||||
// Word documents should call into the docxToMarkdown importer.
|
||||
// See: https://jira.atlassian.com/browse/CONFSERVER-38237
|
||||
if (!value.includes("Content-Type: multipart/related")) {
|
||||
throw new FileImportError("Unsupported Word file");
|
||||
}
|
||||
|
||||
// get boundary marker
|
||||
const boundaryMarker = value.match(/boundary="(.+)"/);
|
||||
if (!boundaryMarker) {
|
||||
throw new FileImportError("Unsupported Word file (No boundary marker)");
|
||||
}
|
||||
|
||||
// get content between multipart boundaries
|
||||
let boundaryReached = 0;
|
||||
const lines = value.split("\n").filter((line) => {
|
||||
if (line.includes(boundaryMarker[1])) {
|
||||
boundaryReached++;
|
||||
return false;
|
||||
}
|
||||
if (line.startsWith("Content-")) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// 1 == definition
|
||||
// 2 == content
|
||||
// 3 == ending
|
||||
if (boundaryReached === 2) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
});
|
||||
|
||||
if (!lines.length) {
|
||||
throw new FileImportError("Unsupported Word file (No content found)");
|
||||
}
|
||||
|
||||
// Mime attachment is "quoted printable" encoded, must be decoded first
|
||||
// https://en.wikipedia.org/wiki/Quoted-printable
|
||||
value = utf8.decode(quotedPrintable.decode(lines.join("\n")));
|
||||
|
||||
// If we don't remove the title here it becomes printed in the document
|
||||
// body by turndown
|
||||
turndownService.remove(["style", "xml", "title"]);
|
||||
|
||||
// Now we should have something that looks like HTML
|
||||
const html = turndownService.turndown(value);
|
||||
|
||||
return html.replace(/<br>/g, " \\n ");
|
||||
}
|
||||
|
||||
export default async function documentImporter({
|
||||
file,
|
||||
user,
|
||||
|
|
|
@ -56,6 +56,25 @@ describe("documentImporter", () => {
|
|||
expect(response.title).toEqual("Heading 1");
|
||||
});
|
||||
|
||||
// Imports the Confluence "Word" export fixture and checks both the converted
// body text and the title derived from it.
it("should convert Confluence Word output to markdown", async () => {
  const fixtureName = "confluence.doc";
  const fixturePath = path.resolve(
    __dirname,
    "..",
    "test",
    "fixtures",
    fixtureName
  );

  const user = await buildUser();
  const file = new File({
    name: fixtureName,
    type: "application/msword",
    path: fixturePath,
  });

  const { text, title } = await documentImporter({
    user,
    file,
    ip,
  });

  expect(text).toContain("this is a test document");
  expect(title).toEqual("Heading 1");
});
|
||||
|
||||
it("should load markdown", async () => {
|
||||
const user = await buildUser();
|
||||
const name = "markdown.md";
|
||||
|
|
|
@ -51,3 +51,9 @@ export function EditorUpdateError(
|
|||
) {
|
||||
return httpErrors(400, message, { id: "editor_update_required" });
|
||||
}
|
||||
|
||||
export function FileImportError(
|
||||
message: string = "The file could not be imported"
|
||||
) {
|
||||
return httpErrors(400, message, { id: "import_error" });
|
||||
}
|
||||
|
|
|
@ -0,0 +1,242 @@
|
|||
Date: Thu, 15 Oct 2020 21:46:24 +0000 (UTC)
|
||||
Message-ID: <1157346768.61.1602798384354@1c1c2e46a009>
|
||||
Subject: Exported From Confluence
|
||||
MIME-Version: 1.0
|
||||
Content-Type: multipart/related;
|
||||
boundary="----=_Part_60_1619701709.1602798384353"
|
||||
|
||||
------=_Part_60_1619701709.1602798384353
|
||||
Content-Type: text/html; charset=UTF-8
|
||||
Content-Transfer-Encoding: quoted-printable
|
||||
Content-Location: file:///C:/exported.html
|
||||
|
||||
<html xmlns:o=3D'urn:schemas-microsoft-com:office:office'
|
||||
xmlns:w=3D'urn:schemas-microsoft-com:office:word'
|
||||
xmlns:v=3D'urn:schemas-microsoft-com:vml'
|
||||
xmlns=3D'urn:w3-org-ns:HTML'>
|
||||
<head>
|
||||
<meta http-equiv=3D"Content-Type" content=3D"text/html; charset=3Dutf-8=
|
||||
">
|
||||
<title>Heading 1</title>
|
||||
<!--[if gte mso 9]>
|
||||
<xml>
|
||||
<o:OfficeDocumentSettings>
|
||||
<o:TargetScreenSize>1024x640</o:TargetScreenSize>
|
||||
<o:PixelsPerInch>72</o:PixelsPerInch>
|
||||
<o:AllowPNG/>
|
||||
</o:OfficeDocumentSettings>
|
||||
<w:WordDocument>
|
||||
<w:View>Print</w:View>
|
||||
<w:Zoom>90</w:Zoom>
|
||||
<w:DoNotOptimizeForBrowser/>
|
||||
</w:WordDocument>
|
||||
</xml>
|
||||
<![endif]-->
|
||||
<style>
|
||||
<!--
|
||||
@page Section1 {
|
||||
size: 8.5in 11.0in;
|
||||
margin: 1.0in;
|
||||
mso-header-margin: .5in;
|
||||
mso-footer-margin: .5in;
|
||||
mso-paper-source: 0;
|
||||
}
|
||||
|
||||
table {
|
||||
border: solid 1px;
|
||||
border-collapse: collapse;
|
||||
}
|
||||
|
||||
table td, table th {
|
||||
border: solid 1px;
|
||||
padding: 5px;
|
||||
}
|
||||
|
||||
td {
|
||||
page-break-inside: avoid;
|
||||
}
|
||||
|
||||
tr {
|
||||
page-break-after: avoid;
|
||||
}
|
||||
|
||||
div.Section1 {
|
||||
page: Section1;
|
||||
}
|
||||
|
||||
/* Confluence print stylesheet. Common to all themes for print medi=
|
||||
a */
|
||||
/* Full of !important until we improve batching for print CSS */
|
||||
|
||||
@media print {
|
||||
#main {
|
||||
padding-bottom: 1em !important; /* The default padding of 6em is to=
|
||||
o much for printouts */
|
||||
}
|
||||
|
||||
body {
|
||||
font-family: Arial, Helvetica, FreeSans, sans-serif;
|
||||
font-size: 10pt;
|
||||
line-height: 1.2;
|
||||
}
|
||||
|
||||
body, #full-height-container, #main, #page, #content, .has-personal-sid=
|
||||
ebar #content {
|
||||
background: #fff !important;
|
||||
color: #000 !important;
|
||||
border: 0 !important;
|
||||
width: 100% !important;
|
||||
height: auto !important;
|
||||
min-height: auto !important;
|
||||
margin: 0 !important;
|
||||
padding: 0 !important;
|
||||
display: block !important;
|
||||
}
|
||||
|
||||
a, a:link, a:visited, a:focus, a:hover, a:active {
|
||||
color: #000;
|
||||
}
|
||||
|
||||
#content h1,
|
||||
#content h2,
|
||||
#content h3,
|
||||
#content h4,
|
||||
#content h5,
|
||||
#content h6 {
|
||||
font-family: Arial, Helvetica, FreeSans, sans-serif;
|
||||
page-break-after: avoid;
|
||||
}
|
||||
|
||||
pre {
|
||||
font-family: Monaco, "Courier New", monospace;
|
||||
}
|
||||
|
||||
#header,
|
||||
.aui-header-inner,
|
||||
#navigation,
|
||||
#sidebar,
|
||||
.sidebar,
|
||||
#personal-info-sidebar,
|
||||
.ia-fixed-sidebar,
|
||||
.page-actions,
|
||||
.navmenu,
|
||||
.ajs-menu-bar,
|
||||
.noprint,
|
||||
.inline-control-link,
|
||||
.inline-control-link a,
|
||||
a.show-labels-editor,
|
||||
.global-comment-actions,
|
||||
.comment-actions,
|
||||
.quick-comment-container,
|
||||
#addcomment {
|
||||
display: none !important;
|
||||
}
|
||||
|
||||
/* CONF-28544 cannot print multiple pages in IE */
|
||||
#splitter-content {
|
||||
position: relative !important;
|
||||
}
|
||||
|
||||
.comment .date::before {
|
||||
content: none !important; /* remove middot for print view */
|
||||
}
|
||||
|
||||
h1.pagetitle img {
|
||||
height: auto;
|
||||
width: auto;
|
||||
}
|
||||
|
||||
.print-only {
|
||||
display: block;
|
||||
}
|
||||
|
||||
#footer {
|
||||
position: relative !important; /* CONF-17506 Place the footer at en=
|
||||
d of the content */
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
background: none;
|
||||
clear: both;
|
||||
}
|
||||
|
||||
#poweredby {
|
||||
border-top: none;
|
||||
background: none;
|
||||
}
|
||||
|
||||
#poweredby li.print-only {
|
||||
display: list-item;
|
||||
font-style: italic;
|
||||
}
|
||||
|
||||
#poweredby li.noprint {
|
||||
display: none;
|
||||
}
|
||||
|
||||
/* no width controls in print */
|
||||
.wiki-content .table-wrap,
|
||||
.wiki-content p,
|
||||
.panel .codeContent,
|
||||
.panel .codeContent pre,
|
||||
.image-wrap {
|
||||
overflow: visible !important;
|
||||
}
|
||||
|
||||
/* TODO - should this work? */
|
||||
#children-section,
|
||||
#comments-section .comment,
|
||||
#comments-section .comment .comment-body,
|
||||
#comments-section .comment .comment-content,
|
||||
#comments-section .comment p {
|
||||
page-break-inside: avoid;
|
||||
}
|
||||
|
||||
#page-children a {
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
/**
|
||||
hide twixies
|
||||
|
||||
the specificity here is a hack because print styles
|
||||
are getting loaded before the base styles. */
|
||||
#comments-section.pageSection .section-header,
|
||||
#comments-section.pageSection .section-title,
|
||||
#children-section.pageSection .section-header,
|
||||
#children-section.pageSection .section-title,
|
||||
.children-show-hide {
|
||||
padding-left: 0;
|
||||
margin-left: 0;
|
||||
}
|
||||
|
||||
.children-show-hide.icon {
|
||||
display: none;
|
||||
}
|
||||
|
||||
/* personal sidebar */
|
||||
.has-personal-sidebar #content {
|
||||
margin-right: 0px;
|
||||
}
|
||||
|
||||
.has-personal-sidebar #content .pageSection {
|
||||
margin-right: 0px;
|
||||
}
|
||||
|
||||
.no-print, .no-print * {
|
||||
display: none !important;
|
||||
}
|
||||
}
|
||||
-->
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Heading 1</h1>
|
||||
<div class=3D"Section1">
|
||||
<p></p>
|
||||
<p>can=E2=80=99t this is a test document ge=
|
||||
ts updated/reverted you can=E2=80=99t change the branch test, create a new =
|
||||
one</p>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
------=_Part_60_1619701709.1602798384353--
|
|
@ -0,0 +1,26 @@
|
|||
<html>
|
||||
|
||||
<body>
|
||||
<h1>Heading 1</h1>
|
||||
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Col 1</th>
|
||||
<th>Col 2</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>Col 1.1</td>
|
||||
<td>Col 2.1</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Col 1.2</td>
|
||||
<td>Col 2.2</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</body>
|
||||
|
||||
</html>
|
17
yarn.lock
17
yarn.lock
|
@ -6699,6 +6699,11 @@ jmespath@0.15.0:
|
|||
resolved "https://registry.yarnpkg.com/jmespath/-/jmespath-0.15.0.tgz#a3f222a9aae9f966f5d27c796510e28091764217"
|
||||
integrity sha1-o/Iiqarp+Wb10nx5ZRDigJF2Qhc=
|
||||
|
||||
joplin-turndown-plugin-gfm@^1.0.12:
|
||||
version "1.0.12"
|
||||
resolved "https://registry.yarnpkg.com/joplin-turndown-plugin-gfm/-/joplin-turndown-plugin-gfm-1.0.12.tgz#f0774183177895c6fedeec951053cab6046dede8"
|
||||
integrity sha512-qL4+1iycQjZ1fs8zk3jSRk7cg3ROBUHk7GKtiLAQLFzLPKErnILUvz5DLszSQvz3s1sTjPbywLDISVUtBY6HaA==
|
||||
|
||||
js-beautify@^1.8.8:
|
||||
version "1.11.0"
|
||||
resolved "https://registry.yarnpkg.com/js-beautify/-/js-beautify-1.11.0.tgz#afb873dc47d58986360093dcb69951e8bcd5ded2"
|
||||
|
@ -9270,6 +9275,13 @@ querystring@0.2.0, querystring@^0.2.0:
|
|||
resolved "https://registry.yarnpkg.com/querystring/-/querystring-0.2.0.tgz#b209849203bb25df820da756e747005878521620"
|
||||
integrity sha1-sgmEkgO7Jd+CDadW50cAWHhSFiA=
|
||||
|
||||
quoted-printable@^1.0.1:
|
||||
version "1.0.1"
|
||||
resolved "https://registry.yarnpkg.com/quoted-printable/-/quoted-printable-1.0.1.tgz#9eebf5eb3d11eef022b264fd2d2b6b2bb3b84cc3"
|
||||
integrity sha1-nuv16z0R7vAismT9LStrK7O4TMM=
|
||||
dependencies:
|
||||
utf8 "^2.1.0"
|
||||
|
||||
randombytes@^2.0.0, randombytes@^2.0.1, randombytes@^2.0.5, randombytes@^2.1.0:
|
||||
version "2.1.0"
|
||||
resolved "https://registry.yarnpkg.com/randombytes/-/randombytes-2.1.0.tgz#df6f84372f0270dc65cdf6291349ab7a473d4f2a"
|
||||
|
@ -11619,6 +11631,11 @@ use@^3.1.0:
|
|||
resolved "https://registry.yarnpkg.com/use/-/use-3.1.1.tgz#d50c8cac79a19fbc20f2911f56eb973f4e10070f"
|
||||
integrity sha512-cwESVXlO3url9YWlFW/TA9cshCEhtu7IKJ/p5soJ/gGpj7vbvFrAY/eIioQ6Dw23KjZhYgiIo8HOs1nQ2vr/oQ==
|
||||
|
||||
utf8@^2.1.0:
|
||||
version "2.1.2"
|
||||
resolved "https://registry.yarnpkg.com/utf8/-/utf8-2.1.2.tgz#1fa0d9270e9be850d9b05027f63519bf46457d96"
|
||||
integrity sha1-H6DZJw6b6FDZsFAn9jUZv0ZFfZY=
|
||||
|
||||
util-deprecate@^1.0.1, util-deprecate@~1.0.1:
|
||||
version "1.0.2"
|
||||
resolved "https://registry.yarnpkg.com/util-deprecate/-/util-deprecate-1.0.2.tgz#450d4dc9fa70de732762fbd2d4a28981419a0ccf"
|
||||
|
|
Reference in New Issue