feat: Add support for "word" files exported from Confluence (#1600)
* Display error message to end user * fix: Improve conversion of tables * fix: Characters at ends of lines in tables lost
This commit is contained in:
@ -1,11 +1,14 @@
|
||||
// @flow
|
||||
import fs from "fs";
|
||||
import File from "formidable/lib/file";
|
||||
import { strikethrough, tables } from "joplin-turndown-plugin-gfm";
|
||||
import mammoth from "mammoth";
|
||||
import quotedPrintable from "quoted-printable";
|
||||
import TurndownService from "turndown";
|
||||
import utf8 from "utf8";
|
||||
import uuid from "uuid";
|
||||
import parseTitle from "../../shared/utils/parseTitle";
|
||||
import { InvalidRequestError } from "../errors";
|
||||
import { FileImportError, InvalidRequestError } from "../errors";
|
||||
import { Attachment, Event, User } from "../models";
|
||||
import dataURItoBuffer from "../utils/dataURItoBuffer";
|
||||
import parseImages from "../utils/parseImages";
|
||||
@ -18,12 +21,28 @@ const turndownService = new TurndownService({
|
||||
headingStyle: "atx",
|
||||
});
|
||||
|
||||
// Use the GitHub-flavored markdown plugin to parse
|
||||
// strikethroughs and tables
|
||||
turndownService.use(strikethrough);
turndownService.use(tables);

// Emit a plain newline for every <br> element.
turndownService.addRule("breaks", {
  filter: ["br"],
  replacement: () => "\n",
});
|
||||
|
||||
interface ImportableFile {
|
||||
type: string;
|
||||
getMarkdown: (file: any) => Promise<string>;
|
||||
}
|
||||
|
||||
const importMapping: ImportableFile[] = [
|
||||
{
|
||||
type: "application/msword",
|
||||
getMarkdown: confluenceToMarkdown,
|
||||
},
|
||||
{
|
||||
type:
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
@ -57,6 +76,60 @@ async function htmlToMarkdown(file): Promise<string> {
|
||||
return turndownService.turndown(value);
|
||||
}
|
||||
|
||||
async function confluenceToMarkdown(file): Promise<string> {
|
||||
let value = await fs.promises.readFile(file.path, "utf8");
|
||||
|
||||
// We're only supporting the ridiculous output from Confluence here, regular
|
||||
// Word documents should call into the docxToMarkdown importer.
|
||||
// See: https://jira.atlassian.com/browse/CONFSERVER-38237
|
||||
if (!value.includes("Content-Type: multipart/related")) {
|
||||
throw new FileImportError("Unsupported Word file");
|
||||
}
|
||||
|
||||
// get boundary marker
|
||||
const boundaryMarker = value.match(/boundary="(.+)"/);
|
||||
if (!boundaryMarker) {
|
||||
throw new FileImportError("Unsupported Word file (No boundary marker)");
|
||||
}
|
||||
|
||||
// get content between multipart boundaries
|
||||
let boundaryReached = 0;
|
||||
const lines = value.split("\n").filter((line) => {
|
||||
if (line.includes(boundaryMarker[1])) {
|
||||
boundaryReached++;
|
||||
return false;
|
||||
}
|
||||
if (line.startsWith("Content-")) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// 1 == definition
|
||||
// 2 == content
|
||||
// 3 == ending
|
||||
if (boundaryReached === 2) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
});
|
||||
|
||||
if (!lines.length) {
|
||||
throw new FileImportError("Unsupported Word file (No content found)");
|
||||
}
|
||||
|
||||
// Mime attachment is "quoted printable" encoded, must be decoded first
|
||||
// https://en.wikipedia.org/wiki/Quoted-printable
|
||||
value = utf8.decode(quotedPrintable.decode(lines.join("\n")));
|
||||
|
||||
// If we don't remove the title here it becomes printed in the document
|
||||
// body by turndown
|
||||
turndownService.remove(["style", "xml", "title"]);
|
||||
|
||||
// Now we should have something that looks like HTML
|
||||
const html = turndownService.turndown(value);
|
||||
|
||||
return html.replace(/<br>/g, " \\n ");
|
||||
}
|
||||
|
||||
export default async function documentImporter({
|
||||
file,
|
||||
user,
|
||||
|
Reference in New Issue
Block a user