poi0020,大檔案讀取,有效解決記憶體溢位。
阿新 • • 發佈:2018-12-19
解決思路:一次只載入少量資料進記憶體,處理完成後再載入下一批,不斷迴圈。難點:傳統的使用者模式(usermodel)讀取方式無法做到這一點,因為使用者模式是一次性將檔案內容全部載入到記憶體中;所以需要引入另外一種事件模式(eventusermodel),事件模式是一次只讀取一條記錄進入記憶體。
程式碼實現:
1、建立maven工程,匯入專案依賴
<!-- Apache POI (poi + poi-ooxml, both 3.17) plus Xerces 2.11.0; XlsxReader instantiates the Xerces SAX parser by class name ("org.apache.xerces.parsers.SAXParser"). -->
<dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>3.17</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> <version>3.17</version> </dependency> <dependency> <groupId>xerces</groupId> <artifactId>xercesImpl</artifactId> <version>2.11.0</version> </dependency>
2、建立xlsx的工具類
package com.honor; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.ss.usermodel.BuiltinFormats; import org.apache.poi.ss.usermodel.DataFormatter; import org.apache.poi.xssf.eventusermodel.XSSFReader; import org.apache.poi.xssf.model.SharedStringsTable; import org.apache.poi.xssf.model.StylesTable; import org.apache.poi.xssf.usermodel.XSSFCellStyle; import org.apache.poi.xssf.usermodel.XSSFRichTextString; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.XMLReaderFactory; import java.io.InputStream; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.ArrayList; import java.util.List; /** * 用於解決office2007版本大資料量問題 **/ public class XlsxReader extends DefaultHandler { private Reader reader; private Method method; public XlsxReader(Reader reader, String methodName) { this.reader = reader; try { this.method = reader.getClass().getMethod(methodName,String.class, String.class, int.class, int.class, List.class); } catch (NoSuchMethodException e) { e.printStackTrace(); } } // 單元格中的資料可能的資料型別 enum CellDataType { BOOL, ERROR, FORMULA, INLINESTR, SSTINDEX, NUMBER, DATE, NULL } // 共享字串表 private SharedStringsTable sst; //上一次的索引值 private String lastIndex; // 檔案的絕對路徑 private String filePath = ""; // 工作表索引 private int sheetIndex = 0; // sheet名 private String sheetName = ""; // 總行數 private int totalRows = 0; // 一行內cell集合 private List<String> cellList = new ArrayList<String>(); // 判斷整行是否為空行的標記 private boolean flag = false; // 當前行 private int curRow = 1; //當前列 private int curCol = 0; // T元素標識 private boolean isTElement; // 異常資訊,如果為空則表示沒有異常 private String exceptionMessage; // 單元格資料型別,預設為字串型別 private CellDataType nextDataType = CellDataType.SSTINDEX; private final DataFormatter formatter = new DataFormatter(); // 單元格日期格式的索引 private short formatIndex; // 日期格式字串 private String 
formatString; //定義前一個元素和當前元素的位置,用來計算其中空的單元格數量,如A6和A8等 private String preRef = null, ref = null; //定義該文件一行最大的單元格數,用來補全一行最後可能缺失的單元格 private String maxRef = null; //在工作簿中共享所有表格的樣式表 private StylesTable stylesTable; /** * 遍歷工作簿中所有的電子表格 * 並快取在mySheetList中 * * @param filename * @throws Exception */ public int process(String filename) throws Exception { filePath = filename; OPCPackage pkg = OPCPackage.open(filename); XSSFReader xssfReader = new XSSFReader(pkg); stylesTable = xssfReader.getStylesTable(); SharedStringsTable sst = xssfReader.getSharedStringsTable(); XMLReader parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser"); this.sst = sst; parser.setContentHandler(this); XSSFReader.SheetIterator sheets = (XSSFReader.SheetIterator) xssfReader.getSheetsData(); //遍歷sheet while (sheets.hasNext()) { //標記初始行為第一行 curRow = 1; sheetIndex++; //sheets.next()和sheets.getSheetName()不能換位置,否則sheetName報錯 InputStream sheet = sheets.next(); sheetName = sheets.getSheetName(); InputSource sheetSource = new InputSource(sheet); //解析excel的每條記錄,在這個過程中startElement()、characters()、endElement()這三個函式會依次執行 parser.parse(sheetSource); sheet.close(); } //返回該excel檔案的總行數,不包括首列和空行 return totalRows; } /** * 第一個執行 * * @param uri * @param localName * @param name * @param attributes * @throws SAXException */ @Override public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException { //c => 單元格 if ("c".equals(name)) { //前一個單元格的位置 if (preRef == null) { preRef = attributes.getValue("r"); } else { preRef = ref; } //當前單元格的位置 ref = attributes.getValue("r"); //設定單元格型別 this.setNextDataType(attributes); } //當元素為t時 if ("t".equals(name)) { isTElement = true; } else { isTElement = false; } //置空 lastIndex = ""; } /** * 第二個執行 * 得到單元格對應的索引值或是內容值 * 如果單元格型別是字串、INLINESTR、數字、日期,lastIndex則是索引值 * 如果單元格型別是布林值、錯誤、公式,lastIndex則是內容值 * * @param ch * @param start * @param length * @throws SAXException */ @Override public void characters(char[] ch, int start, int 
length) throws SAXException { lastIndex += new String(ch, start, length); } /** * 第三個執行 * * @param uri * @param localName * @param name * @throws SAXException */ @Override public void endElement(String uri, String localName, String name) throws SAXException { //t元素也包含字串 if (isTElement) {//這個程式沒經過 //將單元格內容加入rowlist中,在這之前先去掉字串前後的空白符 String value = lastIndex.trim(); cellList.add(curCol, value); curCol++; isTElement = false; //如果裡面某個單元格含有值,則標識該行不為空行 if (value != null && !"".equals(value)) { flag = true; } } else if ("v".equals(name)) { //v => 單元格的值,如果單元格是字串,則v標籤的值為該字串在SST中的索引 String value = this.getDataValue(lastIndex.trim(), "");//根據索引值獲取對應的單元格值 //補全單元格之間的空單元格 if (!ref.equals(preRef)) { int len = countNullCell(ref, preRef); for (int i = 0; i < len; i++) { cellList.add(curCol, ""); curCol++; } } cellList.add(curCol, value); curCol++; //如果裡面某個單元格含有值,則標識該行不為空行 if (value != null && !"".equals(value)) { flag = true; } } else { //如果標籤名稱為row,這說明已到行尾,呼叫optRows()方法 if ("row".equals(name)) { //預設第一行為表頭,以該行單元格數目為最大數目 if (curRow == 1) { maxRef = ref; } //補全一行尾部可能缺失的單元格 if (maxRef != null) { int len = countNullCell(maxRef, ref); for (int i = 0; i <= len; i++) { cellList.add(curCol, ""); curCol++; } } //該行不為空行且該行不是第一行,則傳送(第一行為列名,不需要) if (flag && curRow != 1) { try { method.invoke(reader, filePath, sheetName, sheetIndex, curRow, cellList); } catch (IllegalAccessException e) { e.printStackTrace(); } catch (InvocationTargetException e) { e.printStackTrace(); } totalRows++; } cellList.clear(); curRow++; curCol = 0; preRef = null; ref = null; flag = false; } } } /** * 處理資料型別 * * @param attributes */ public void setNextDataType(Attributes attributes) { //cellType為空,則表示該單元格型別為數字 nextDataType = CellDataType.NUMBER; formatIndex = -1; formatString = null; //單元格型別 String cellType = attributes.getValue("t"); String cellStyleStr = attributes.getValue("s"); //獲取單元格的位置,如A1,B1 // String columnData = attributes.getValue("r"); if ("b".equals(cellType)) { //處理布林值 nextDataType = CellDataType.BOOL; } 
else if ("e".equals(cellType)) { //處理錯誤 nextDataType = CellDataType.ERROR; } else if ("inlineStr".equals(cellType)) { nextDataType = CellDataType.INLINESTR; } else if ("s".equals(cellType)) { //處理字串 nextDataType = CellDataType.SSTINDEX; } else if ("str".equals(cellType)) { nextDataType = CellDataType.FORMULA; } if (cellStyleStr != null) { //處理日期 int styleIndex = Integer.parseInt(cellStyleStr); XSSFCellStyle style = stylesTable.getStyleAt(styleIndex); formatIndex = style.getDataFormat(); formatString = style.getDataFormatString(); if (formatString.contains("m/d/yy")) { nextDataType = CellDataType.DATE; formatString = "yyyy-MM-dd hh:mm:ss"; } if (formatString == null) { nextDataType = CellDataType.NULL; formatString = BuiltinFormats.getBuiltinFormat(formatIndex); } } } /** * 對解析出來的資料進行型別處理 * * @param value 單元格的值, * value代表解析:BOOL的為0或1, ERROR的為內容值,FORMULA的為內容值,INLINESTR的為索引值需轉換為內容值, * SSTINDEX的為索引值需轉換為內容值, NUMBER為內容值,DATE為內容值 * @param thisStr 一個空字串 * @return */ @SuppressWarnings("deprecation") public String getDataValue(String value, String thisStr) { switch (nextDataType) { // 這幾個的順序不能隨便交換,交換了很可能會導致資料錯誤 case BOOL: //布林值 char first = value.charAt(0); thisStr = first == '0' ? 
"FALSE" : "TRUE"; break; case ERROR: //錯誤 thisStr = "\"ERROR:" + value.toString() + '"'; break; case FORMULA: //公式 thisStr = '"' + value.toString() + '"'; break; case INLINESTR: XSSFRichTextString rtsi = new XSSFRichTextString(value.toString()); thisStr = rtsi.toString(); rtsi = null; break; case SSTINDEX: //字串 String sstIndex = value.toString(); try { int idx = Integer.parseInt(sstIndex); XSSFRichTextString rtss = new XSSFRichTextString(sst.getEntryAt(idx));//根據idx索引值獲取內容值 thisStr = rtss.toString(); rtss = null; } catch (NumberFormatException ex) { thisStr = value.toString(); } break; case NUMBER: //數字 if (formatString != null) { thisStr = formatter.formatRawCellContents(Double.parseDouble(value), formatIndex, formatString).trim(); } else { thisStr = value; } thisStr = thisStr.replace("_", "").trim(); break; case DATE: //日期 thisStr = formatter.formatRawCellContents(Double.parseDouble(value), formatIndex, formatString); // 對日期字串作特殊處理,去掉T thisStr = thisStr.replace("T", " "); break; default: thisStr = " "; break; } return thisStr; } public int countNullCell(String ref, String preRef) { //excel2007最大行數是1048576,最大列數是16384,最後一列列名是XFD String xfd = ref.replaceAll("\\d+", ""); String xfd_1 = preRef.replaceAll("\\d+", ""); xfd = fillChar(xfd, 3, '@', true); xfd_1 = fillChar(xfd_1, 3, '@', true); char[] letter = xfd.toCharArray(); char[] letter_1 = xfd_1.toCharArray(); int res = (letter[0] - letter_1[0]) * 26 * 26 + (letter[1] - letter_1[1]) * 26 + (letter[2] - letter_1[2]); return res - 1; } public String fillChar(String str, int len, char let, boolean isPre) { int len_1 = str.length(); if (len_1 < len) { if (isPre) { for (int i = 0; i < (len - len_1); i++) { str = let + str; } } else { for (int i = 0; i < (len - len_1); i++) { str = str + let; } } } return str; } /** * @return the exceptionMessage */ public String getExceptionMessage() { return exceptionMessage; } }
注:工具類中用到一個Reader介面,這是一個資料返回的介面;需要使用大檔案讀取資料的類必須實現(implements)這個介面,資料會在自己實現的方法中返回。回撥方法的名稱可以換,但引數列表固定。
3、Reader介面
package com.honor;

import java.util.List;

/**
 * Callback interface used to hand row data back to the caller.
 *
 * @author rongyaowen
 * @create 2018-11-02 23:50
 **/
public interface Reader {

    /**
     * Receives the data of one row.
     *
     * @param filePath   path of the file
     * @param sheetName  name of the sheet
     * @param sheetIndex sequence number of the sheet
     * @param curRow     row number
     * @param cellList   all cell values of the row
     */
    void read(String filePath, String sheetName, int sheetIndex, int curRow, List<String> cellList);
}
這邊的read方法相當於一個模板方法:如果你的業務需求只要一次讀取大檔案(一次業務邏輯處理),那麼直接實現這個方法就可以了;如果需要多次讀取大檔案,且需要做不同的業務邏輯處理,那麼在實現類中仿照read再寫一個只有名稱和read不同、引數完全相同的方法。後面有具體例子。
4、Test類
package com.honor;
import java.util.List;
/**
 * Demo that reads an Excel file through the streaming readers and prints every row it receives.
 *
 * @author y
 * @create 2018-01-19 0:13
 **/
public class ReaderTest implements Reader {
    /** File extension of Excel 2003 workbooks. */
    public static final String EXCEL03_EXTENSION = ".xls";
    /** File extension of Excel 2007 workbooks. */
    public static final String EXCEL07_EXTENSION = ".xlsx";

    /**
     * Reads the workbook named by the first argument, or the built-in sample path if no
     * argument is given.
     *
     * @param args optional: path of the workbook to read
     * @throws Exception if the file cannot be read
     */
    public static void main(String[] args) throws Exception {
        String filePath = args.length > 0 ? args[0] : "C:\\Users\\Administrator\\Desktop\\test.xlsx";
        new ReaderTest().read(filePath);
    }

    /**
     * Picks the reader that matches the file extension (compared case-insensitively), processes
     * the file with {@code "read"} as callback method and prints the number of delivered rows.
     *
     * @param filePath path of an .xls or .xlsx file
     * @throws IllegalArgumentException if {@code filePath} is null or has another extension
     * @throws Exception                if the reader fails to process the file
     */
    public void read(String filePath) throws Exception {
        if (filePath == null) {
            throw new IllegalArgumentException("filePath must not be null");
        }
        int totalRows;
        if (hasExtension(filePath, EXCEL03_EXTENSION)) {
            // Excel 2003 file
            XlsReader excelXls = new XlsReader(this, "read");
            totalRows = excelXls.process(filePath);
        } else if (hasExtension(filePath, EXCEL07_EXTENSION)) {
            // Excel 2007 file
            XlsxReader excelXlsxReader = new XlsxReader(this, "read");
            totalRows = excelXlsxReader.process(filePath);
        } else {
            throw new IllegalArgumentException("檔案格式錯誤,fileName的副檔名只能是xls或xlsx。");
        }
        System.out.println("傳送的總行數:" + totalRows);
    }

    /** Returns true if {@code path} ends with {@code extension}, ignoring case. */
    private static boolean hasExtension(String path, String extension) {
        // regionMatches returns false for a negative offset, i.e. when path is shorter than extension.
        int offset = path.length() - extension.length();
        return path.regionMatches(true, offset, extension, 0, extension.length());
    }

    /** Prints the row passed in by the reader. */
    @Override
    public void read(String filePath, String sheetName, int sheetIndex, int curRow, List<String> cellList) {
        System.out.println("read");
        System.out.println(filePath + ";" + sheetName + ";" + sheetIndex + ";" + curRow + ";" + cellList);
    }
}
注:這邊Test類實現了Reader介面,資料會從read方法返回。new XlsxReader()的第一個引數是實現了Reader介面的物件引用,第二個引數是用來接收資料的回撥方法名稱;這個方法名稱可以變,但是引數列表不能變。只需要一次邏輯處理的例子如下
例如我讀的excel表格內容如下:
輸出結果:
例2、需要在test類中讀取兩個大檔案,做不同的業務邏輯處理
只需要在test類中新增一個名稱和read不一樣、引數一樣的方法,如下
/**
 * Second callback with the same parameter list as {@code read}; prints a "read2" marker line
 * followed by the row, with the fields separated by ';'.
 */
public void read2(String filePath, String sheetName, int sheetIndex, int curRow, List<String> cellList) {
    StringBuilder line = new StringBuilder();
    line.append(filePath).append(";");
    line.append(sheetName).append(";");
    line.append(sheetIndex).append(";");
    line.append(curRow).append(";");
    line.append(cellList);
    System.out.println("read2");
    System.out.println(line.toString());
}
輸出介面如下
5、xls大檔案的讀取。其實這個沒有什麼用,因為2003的excel每個工作表最大只能儲存65536行(6萬多條),談不上大檔案。
package com.honor;
import org.apache.poi.hssf.eventusermodel.*;
import org.apache.poi.hssf.eventusermodel.dummyrecord.LastCellOfRowDummyRecord;
import org.apache.poi.hssf.eventusermodel.dummyrecord.MissingCellDummyRecord;
import org.apache.poi.hssf.model.HSSFFormulaParser;
import org.apache.poi.hssf.record.*;
import org.apache.poi.hssf.usermodel.HSSFDataFormatter;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import java.io.FileInputStream;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.List;
/**
* @author y
* @create 2018-01-19 14:18
* @desc 用於解決.xls2003版本大資料量問題
**/
public class XlsReader implements HSSFListener {
private Reader reader;
private Method method;
public XlsReader(Reader reader, String methodName) {
this.reader = reader;
try {
this.method = reader.getClass().getMethod(methodName, String.class, String.class, int.class, int.class, List.class);
} catch (NoSuchMethodException e) {
e.printStackTrace();
}
}
private int minColums = -1;
private POIFSFileSystem fs;
/**
* 總行數
*/
private int totalRows = 0;
/**
* 上一行row的序號
*/
private int lastRowNumber;
/**
* 上一單元格的序號
*/
private int lastColumnNumber;
/**
* 是否輸出formula,還是它對應的值
*/
private boolean outputFormulaValues = true;
/**
* 用於轉換formulas
*/
private EventWorkbookBuilder.SheetRecordCollectingListener workbookBuildingListener;
//excel2003工作簿
private HSSFWorkbook stubWorkbook;
private SSTRecord sstRecord;
private FormatTrackingHSSFListener formatListener;
private final HSSFDataFormatter formatter = new HSSFDataFormatter();
/**
* 檔案的絕對路徑
*/
private String filePath = "";
//表索引
private int sheetIndex = 0;
private BoundSheetRecord[] orderedBSRs;
@SuppressWarnings("unchecked")
private ArrayList boundSheetRecords = new ArrayList();
private int nextRow;
private int nextColumn;
private boolean outputNextStringRecord;
//當前行
private int curRow = 0;
//儲存一行記錄所有單元格的容器
private List<String> cellList = new ArrayList<String>();
/**
* 判斷整行是否為空行的標記
*/
private boolean flag = false;
@SuppressWarnings("unused")
private String sheetName;
/**
* 遍歷excel下所有的sheet
*
* @param fileName
* @throws Exception
*/
public int process(String fileName) throws Exception {
filePath = fileName;
this.fs = new POIFSFileSystem(new FileInputStream(fileName));
MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this);
formatListener = new FormatTrackingHSSFListener(listener);
HSSFEventFactory factory = new HSSFEventFactory();
HSSFRequest request = new HSSFRequest();
if (outputFormulaValues) {
request.addListenerForAllRecords(formatListener);
} else {
workbookBuildingListener = new EventWorkbookBuilder.SheetRecordCollectingListener(formatListener);
request.addListenerForAllRecords(workbookBuildingListener);
}
factory.processWorkbookEvents(request, fs);
return totalRows; //返回該excel檔案的總行數,不包括首列和空行
}
/**
* HSSFListener 監聽方法,處理Record
* 處理每個單元格
*
* @param record
*/
@SuppressWarnings("unchecked")
public void processRecord(Record record) {
int thisRow = -1;
int thisColumn = -1;
String thisStr = null;
String value = null;
switch (record.getSid()) {
case BoundSheetRecord.sid:
boundSheetRecords.add(record);
break;
case BOFRecord.sid: //開始處理每個sheet
BOFRecord br = (BOFRecord) record;
if (br.getType() == BOFRecord.TYPE_WORKSHEET) {
//如果有需要,則建立子工作簿
if (workbookBuildingListener != null && stubWorkbook == null) {
stubWorkbook = workbookBuildingListener.getStubHSSFWorkbook();
}
if (orderedBSRs == null) {
orderedBSRs = BoundSheetRecord.orderByBofPosition(boundSheetRecords);
}
sheetName = orderedBSRs[sheetIndex].getSheetname();
sheetIndex++;
}
break;
case SSTRecord.sid:
sstRecord = (SSTRecord) record;
break;
case BlankRecord.sid: //單元格為空白
BlankRecord brec = (BlankRecord) record;
thisRow = brec.getRow();
thisColumn = brec.getColumn();
thisStr = "";
cellList.add(thisColumn, thisStr);
break;
case BoolErrRecord.sid: //單元格為布林型別
BoolErrRecord berec = (BoolErrRecord) record;
thisRow = berec.getRow();
thisColumn = berec.getColumn();
thisStr = berec.getBooleanValue() + "";
cellList.add(thisColumn, thisStr);
checkRowIsNull(thisStr); //如果裡面某個單元格含有值,則標識該行不為空行
break;
case FormulaRecord.sid://單元格為公式型別
FormulaRecord frec = (FormulaRecord) record;
thisRow = frec.getRow();
thisColumn = frec.getColumn();
if (outputFormulaValues) {
if (Double.isNaN(frec.getValue())) {
outputNextStringRecord = true;
nextRow = frec.getRow();
nextColumn = frec.getColumn();
} else {
thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"';
}
} else {
thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"';
}
cellList.add(thisColumn, thisStr);
checkRowIsNull(thisStr); //如果裡面某個單元格含有值,則標識該行不為空行
break;
case StringRecord.sid: //單元格中公式的字串
if (outputNextStringRecord) {
StringRecord srec = (StringRecord) record;
thisStr = srec.getString();
thisRow = nextRow;
thisColumn = nextColumn;
outputNextStringRecord = false;
}
break;
case LabelRecord.sid:
LabelRecord lrec = (LabelRecord) record;
curRow = thisRow = lrec.getRow();
thisColumn = lrec.getColumn();
value = lrec.getValue().trim();
value = value.equals("") ? "" : value;
cellList.add(thisColumn, value);
checkRowIsNull(value); //如果裡面某個單元格含有值,則標識該行不為空行
break;
case LabelSSTRecord.sid: //單元格為字串型別
LabelSSTRecord lsrec = (LabelSSTRecord) record;
curRow = thisRow = lsrec.getRow();
thisColumn = lsrec.getColumn();
if (sstRecord == null) {
cellList.add(thisColumn, "");
} else {
value = sstRecord.getString(lsrec.getSSTIndex()).toString().trim();
value = value.equals("") ? "" : value;
cellList.add(thisColumn, value);
checkRowIsNull(value); //如果裡面某個單元格含有值,則標識該行不為空行
}
break;
case NumberRecord.sid: //單元格為數字型別
NumberRecord numrec = (NumberRecord) record;
curRow = thisRow = numrec.getRow();
thisColumn = numrec.getColumn();
//第一種方式
//value = formatListener.formatNumberDateCell(numrec).trim();//這個被寫死,採用的m/d/yy h:mm格式,不符合要求
//第二種方式,參照formatNumberDateCell裡面的實現方法編寫
Double valueDouble = ((NumberRecord) numrec).getValue();
String formatString = formatListener.getFormatString(numrec);
if (formatString.contains("m/d/yy")) {
formatString = "yyyy-MM-dd hh:mm:ss";
}
int formatIndex = formatListener.getFormatIndex(numrec);
value = formatter.formatRawCellContents(valueDouble, formatIndex, formatString).trim();
value = value.equals("") ? "" : value;
//向容器加入列值
cellList.add(thisColumn, value);
checkRowIsNull(value); //如果裡面某個單元格含有值,則標識該行不為空行
break;
default:
break;
}
//遇到新行的操作
if (thisRow != -1 && thisRow != lastRowNumber) {
lastColumnNumber = -1;
}
//空值的操作
if (record instanceof MissingCellDummyRecord) {
MissingCellDummyRecord mc = (MissingCellDummyRecord) record;
curRow = thisRow = mc.getRow();
thisColumn = mc.getColumn();
cellList.add(thisColumn, "");
}
//更新行和列的值
if (thisRow > -1)
lastRowNumber = thisRow;
if (thisColumn > -1)
lastColumnNumber = thisColumn;
//行結束時的操作
if (record instanceof LastCellOfRowDummyRecord) {
if (minColums > 0) {
//列值重新置空
if (lastColumnNumber == -1) {
lastColumnNumber = 0;
}
}
lastColumnNumber = -1;
if (flag && curRow != 0) { //該行不為空行且該行不是第一行,傳送(第一行為列名,不需要)
try {
// 通過反射呼叫,這樣在實現類中就可以多次使用這個方法
method.invoke(reader, filePath, sheetName, sheetIndex, curRow + 1, cellList);
} catch (IllegalAccessException e) {
e.printStackTrace();
} catch (InvocationTargetException e) {
e.printStackTrace();
}
totalRows++;
}
//清空容器
cellList.clear();
flag = false;
}
}
/**
* 如果裡面某個單元格含有值,則標識該行不為空行
*
* @param value
*/
public void checkRowIsNull(String value) {
if (value != null && !"".equals(value)) {
flag = true;
}
}
}
注:以上的方法參考網上部落格,我這邊只是做了一下反射方法的封裝處理,使一個實現類方法可以讀取excel檔案走多套邏輯處理