Tika
官网:https://tika.apache.org/3.3.1/examples.html
引入依赖
1 2 3 4 5 6 7 8 9 10 11 12
| <!-- tika-core: the Tika API layer -->
<dependency>
    <groupId>org.apache.tika</groupId>
    <artifactId>tika-core</artifactId>
    <version>3.2.3</version>
    <scope>compile</scope>
</dependency>
<!-- tika-parsers-standard-package: parser implementations for the mainstream formats; keep its version in sync with tika-core -->
<dependency>
    <groupId>org.apache.tika</groupId>
    <artifactId>tika-parsers-standard-package</artifactId>
    <version>3.2.3</version>
    <scope>compile</scope>
</dependency>
|
tika-core 提供 API 层,tika-parsers-standard-package 包含所有主流格式的解析器实现。
示例
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
|
/**
 * Extracts the text of an uploaded file and returns it after cleaning.
 *
 * <p>The upload is streamed into the stream-based {@code parseContent} overload and the
 * extracted text is then passed through {@code textCleaningService.cleanText}.
 *
 * @param file the uploaded file to parse
 * @return the cleaned text, or an empty string when the upload is reported as empty
 * @throws BusinessException with {@code ErrorCode.INTERNAL_ERROR} when reading or parsing
 *         fails with an {@code IOException}, {@code TikaException} or {@code SAXException}
 */
public String parseContent(MultipartFile file) {
    final String originalName = file.getOriginalFilename();
    log.info("开始解析文件: {}", originalName);

    // Nothing to parse: answer with an empty string instead of invoking the parser.
    final boolean nothingToParse = file.isEmpty() || file.getSize() == 0;
    if (nothingToParse) {
        log.warn("文件为空: {}", originalName);
        return "";
    }

    try (InputStream upload = file.getInputStream()) {
        final String rawText = parseContent(upload);
        final String cleaned = textCleaningService.cleanText(rawText);
        log.info("文件解析成功,提取文本长度: {} 字符", cleaned.length());
        return cleaned;
    } catch (IOException | TikaException | SAXException ex) {
        // Log the full stack trace here; the caller only receives the message text.
        log.error("文件解析失败: {}", ex.getMessage(), ex);
        final String reason = "文件解析失败: " + ex.getMessage();
        throw new BusinessException(ErrorCode.INTERNAL_ERROR, reason);
    }
}
/**
 * Parses the given stream with Tika's {@code AutoDetectParser} and returns the text
 * collected by a {@code BodyContentHandler}.
 *
 * <p>The stream is not closed here; the caller owns it.
 *
 * @param inputStream the document bytes to parse
 * @return the text accumulated by the content handler
 * @throws IOException   if the stream cannot be read
 * @throws TikaException if the parser reports a failure
 * @throws SAXException  if the content handler reports a failure
 */
private String parseContent(InputStream inputStream) throws IOException, TikaException, SAXException {
    final AutoDetectParser autoParser = new AutoDetectParser();

    // MAX_TEXT_LENGTH is handed to the handler's int constructor argument (its write limit).
    final BodyContentHandler textSink = new BodyContentHandler(MAX_TEXT_LENGTH);

    // PDF options: do not extract inline images, and enable sort-by-position.
    final PDFParserConfig pdfOptions = new PDFParserConfig();
    pdfOptions.setExtractInlineImages(false);
    pdfOptions.setSortByPosition(true);

    final ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, autoParser);
    // NOTE(review): NoOpEmbeddedDocumentExtractor is declared outside this snippet;
    // by its name it presumably skips embedded documents -- confirm.
    parseContext.set(EmbeddedDocumentExtractor.class, new NoOpEmbeddedDocumentExtractor());
    parseContext.set(PDFParserConfig.class, pdfOptions);

    autoParser.parse(inputStream, textSink, new Metadata(), parseContext);
    return textSink.toString();
}
|
提取出的文本通常还需要进行二次清洗(例如用正则表达式处理),上面示例中的 textCleaningService.cleanText 做的就是这一步。