1. 程式人生 > >利用POI框架的SAX方式處理大資料2007版Excel(xlsx)【第2版】

利用POI框架的SAX方式處理大資料2007版Excel(xlsx)【第2版】

【第1版】地址

針對老早寫的POI處理Excel的大資料讀取問題,看到好多人關注,感覺自己還是應該更新一版,畢竟雖然是自己備份,但是如果新手能少走彎路,也算欣慰。下面的版本是我的專案迭代過程中個人認為比較穩定和健壯的,算作【第2版】吧,裡面修復了【第1版】的很多bug,諸如計算前後單元格差值的函式getLevel;識別新行的正則也略有優化。最後希望這段程式碼能為你爭取更多的學習時間,而不是成為懶惰的藉口。

package com.fulong.utils.poi;

import java.io.File;

/* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
==================================================================== */

import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

import com.fulong.utils.report.tool.MathUtil;

/**
 * XSSF and SAX (Event API) basic example. See {@link XLSX2CSV} for a fuller
 * example of doing XSLX processing with the XSSF Event code.
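 * Known limitation: the strategy startElement uses to detect a new row is
 * flawed — a row is only recognised when its cell in the indexC4Data column is present.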
 */
public class MyExcel2007ForPaging_high {

	/**
	 * Column letters (A, B, C ... AA, AB ...) of the first column that must hold
	 * a value in every row; a cell in this column is what marks the start of a new row.
	 */
	private static final String indexC4Data = "A";
	
	/**
	 * Cells of every row that was collected while parsing; rows are only added
	 * while isAccess() is true.
	 */
	public List<List<IndexValue>> dataList = new ArrayList<List<IndexValue>>();
	/*
	 * Scratch buffer holding the cells of the row currently being parsed.
	 */
	private List<IndexValue> rowData;
	
	// Inclusive bounds that isAccess() compares currentRow against.
	private final int startRow;
	private final int endRow;
	// Incremented each time a cell in the indexC4Data column is encountered.
	private int currentRow = 0;
	
	// Path handed to OPCPackage.open() in processFirstSheet().
	private final String filename;
	

	public MyExcel2007ForPaging_high(String filename, int startRow, int endRow) throws Exception {
		if (StringUtils.isBlank(filename))
			throw new Exception("檔名不能空");
		this.filename = filename;
		this.startRow = startRow;
		this.endRow = endRow;
		processFirstSheet();
	}

	/**
	 * Opens the workbook named by {@code filename} and streams the sheet with
	 * relationship id {@code rId1} through the SAX handler.
	 *
	 * The sheet stream and the package are released in {@code finally} blocks so
	 * that a parse failure no longer leaves the file handle open.
	 *
	 * @throws Exception when the package cannot be opened or the sheet XML cannot be parsed
	 */
	private void processFirstSheet() throws Exception {
		OPCPackage pkg = OPCPackage.open(filename);
		try {
			XSSFReader r = new XSSFReader(pkg);
			SharedStringsTable sst = r.getSharedStringsTable();

			XMLReader parser = fetchSheetParser(sst);

			// To look up the Sheet Name / Sheet Order / rID,
			// you need to process the core Workbook stream.
			// Normally it's of the form rId# or rSheet#
			// NOTE(review): assumes rId1 is the first worksheet — confirm against the workbook relationships.
			InputStream sheet1 = r.getSheet("rId1");
			try {
				parser.parse(new InputSource(sheet1));
			} finally {
				sheet1.close();
			}
		} finally {
			pkg.close();
		}
	}

	/**
	 * Builds the SAX reader used for the sheet XML and wires a fresh
	 * {@link PagingHandler} into it.
	 *
	 * @param sst shared strings table handed to the handler
	 * @return reader whose content handler is a new {@code PagingHandler}
	 * @throws SAXException when the named parser class cannot be created
	 */
	private XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException {
		final XMLReader xmlReader = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
		xmlReader.setContentHandler(new PagingHandler(sst));
		return xmlReader;
	}

	/**
	 * SAX handler that collects the cell values of the rows for which
	 * {@code isAccess()} is true. See org.xml.sax.helpers.DefaultHandler javadocs.
	 *
	 * Values of cells typed {@code t="s"} are resolved through the shared
	 * strings table; every other {@code <v>} value is stored as its raw text
	 * (dates, numbers and formulas are not formatted).
	 */
	private class PagingHandler extends DefaultHandler {
		/**
		 * Matches a cell reference in the {@code indexC4Data} column, i.e. the
		 * first cell of a row. Compiled once rather than once per cell.
		 */
		private final Pattern newRowPattern = Pattern.compile("^" + indexC4Data + "[0-9]+$");
		private final SharedStringsTable sst;
		/** Text accumulated by characters() since the last element start. */
		private final StringBuilder lastContents = new StringBuilder();
		/** True while the current cell's value is an index into the shared strings table. */
		private boolean nextIsString;
		/** Reference (e.g. "B12") of the cell currently being parsed. */
		private String index = null;

		private PagingHandler(SharedStringsTable sst) {
			this.sst = sst;
		}

		/**
		 * Handles the start of an element. For a cell ({@code c}) it records the
		 * cell reference, starts a new row when the cell lies in the
		 * {@code indexC4Data} column, and notes whether the value is a shared string.
		 *
		 * @throws SAXException when a cell carries no {@code r} attribute, because
		 *                      its column cannot be determined
		 */
		@Override
		public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {
			// c => cell
			if (name.equals("c")) {
				index = attributes.getValue("r");
				if (index == null) {
					throw new SAXException("Cell element without an 'r' (reference) attribute is not supported");
				}

				// A cell in the mandatory column starts a new row.
				if (newRowPattern.matcher(index).find()) {
					// Store the previous row before currentRow moves on, so that
					// isAccess() is still evaluated against the previous row's number.
					flushRow();
					rowData = new ArrayList<IndexValue>();
					currentRow++;
				}

				// Figure out if the value is an index in the SST
				nextIsString = "s".equals(attributes.getValue("t"));
			}
			// Clear contents cache
			lastContents.setLength(0);
		}

		/**
		 * Handles the end of an element. Only {@code </v>} is of interest: the
		 * accumulated text is resolved (if it is a shared-string index) and
		 * appended to the current row.
		 */
		@Override
		public void endElement(String uri, String localName, String name) throws SAXException {
			// v => contents of a cell. Other closing tags carry no value; reacting
			// to them would try to parse an empty buffer as a shared-string index.
			if (!isAccess() || !name.equals("v")) {
				return;
			}

			// Process here rather than in characters(), which may be called more than once.
			String value = lastContents.toString();
			if (nextIsString) {
				int idx = Integer.parseInt(value);
				value = new XSSFRichTextString(sst.getEntryAt(idx)).toString();
				nextIsString = false;
			}

			// The window can be open before any row has started (startRow <= 0 and
			// the first cell is not in the mandatory column); keep those cells too.
			if (rowData == null) {
				rowData = new ArrayList<IndexValue>();
			}
			rowData.add(new IndexValue(index, value));
		}

		/**
		 * Accumulates element text while the current row is inside the window.
		 */
		@Override
		public void characters(char[] ch, int start, int length) throws SAXException {
			if (isAccess()) {
				lastContents.append(ch, start, length);
			}
		}

		/**
		 * Stores the row that is still pending when the document ends. The last
		 * row has no following row, so startElement() never gets the chance to
		 * store it.
		 */
		@Override
		public void endDocument() throws SAXException {
			flushRow();
		}

		/** Appends the pending row to dataList when it is non-empty and inside the window. */
		private void flushRow() {
			if (rowData != null && isAccess() && !rowData.isEmpty()) {
				dataList.add(rowData);
			}
		}

	}

	/**
	 * Tells whether the row currently being parsed lies inside the requested window.
	 *
	 * @return {@code true} when {@code startRow <= currentRow <= endRow}
	 */
	private boolean isAccess() {
		return currentRow >= startRow && currentRow <= endRow;
	}

	/**
	 * A cell value together with the cell reference (e.g. "C7") it was read from.
	 */
	private class IndexValue {
		final String v_index;
		final String v_value;

		public IndexValue(String v_index, String v_value) {
			this.v_index = v_index;
			this.v_value = v_value;
		}

		@Override
		public String toString() {
			StringBuilder text = new StringBuilder("IndexValue [v_index=");
			text.append(v_index);
			text.append(", v_value=");
			text.append(v_value);
			text.append("]");
			return text.toString();
		}

		/**
		 * Computes how many columns this cell lies to the right of {@code p}.
		 * The digits (row part) are stripped from both references and only the
		 * letters (column part) are compared.
		 *
		 * @param p the cell to measure from
		 * @return this cell's column value minus {@code p}'s column value, as
		 *         produced by MathUtil.fromNumberSystem26 (presumably a base-26
		 *         letter-to-number conversion — confirm against MathUtil)
		 */
		public int getLevel(IndexValue p) {
			final String otherColumn = p.v_index.replaceAll("[0-9]", "");
			final String ownColumn = this.v_index.replaceAll("[0-9]", "");
			return MathUtil.fromNumberSystem26(ownColumn) - MathUtil.fromNumberSystem26(otherColumn);
		}
	}

	/**
	 * Converts the collected cells into plain string rows, inserting
	 * {@code null} for every empty column between two neighbouring cells and
	 * trimming every value.
	 *
	 * When two neighbouring cells of a row are not in ascending column order
	 * (column difference {@code <= 0}) the row is cut off after the cells
	 * processed so far and a diagnostic is written to {@code System.err}. The
	 * cut-off is tracked per row, so it does not affect the rows that follow.
	 *
	 * @return one list per collected row; empty when nothing was collected
	 * @throws Exception declared for backward compatibility
	 */
	public List<List<String>> getMyDataList() throws Exception {

		List<List<String>> myDataList = new ArrayList<List<String>>();
		if (dataList == null || dataList.isEmpty()) {
			return myDataList;
		}
		for (int i = 0; i < dataList.size(); i++) {
			List<IndexValue> i_list = dataList.get(i);
			List<String> row = new ArrayList<String>();
			// Must be reset for every row: a sticky flag would drop the last
			// cell of every row following a truncated one.
			boolean truncated = false;
			int j = 0;

			for (; j < i_list.size() - 1; j++) {
				// Store the current value.
				IndexValue current = i_list.get(j);
				row.add(trimValue(current.v_value));
				// Distance in columns to the next cell.
				IndexValue next = i_list.get(j + 1);
				int level = next.getLevel(current);
				if (level <= 0) {
					System.err.println("---!!!到達最後一行,行號:"+(i+1)+";level:"+level+"[超出處理範圍]");
					truncated = true;
					break;
				}
				// Pad the skipped columns with null.
				for (int k = 0; k < level - 1; k++) {
					row.add(null);
				}
			}
			// The last cell of a row has no successor and is appended here;
			// after a truncation index j was already added inside the loop.
			if (!truncated && !i_list.isEmpty()) {
				row.add(trimValue(i_list.get(j).v_value));
			}
			myDataList.add(row);

		}
		return myDataList;
	}

	/** Returns the trimmed value, or {@code null} when the value is {@code null}. */
	private static String trimValue(String value) {
		return value != null ? value.trim() : null;
	}
	
	/**
	 * Demo entry point: reads rows 1 to 50 of a fixed workbook and prints them.
	 */
	public static void main(String[] args) throws Exception {
		final String workbookPath = new File("e:/a.xlsx").getPath();
		final MyExcel2007ForPaging_high reader = new MyExcel2007ForPaging_high(workbookPath, 1, 50);
		System.out.println(reader.getMyDataList());
	}
}