我有一个150MB的单张excel文件,使用以下命令在功能非常强大的计算机上打开大约需要7分钟:
# using python import xlrd wb = xlrd.open_workbook(file) sh = wb.sheet_by_index(0)
有什么办法可以更快地打开excel文件吗?我愿意接受甚至非常古怪的建议(例如hadoop,spark,c,java等)。理想情况下,如果这不是白日梦,我正在寻找一种在30秒内打开文件的方法。另外,上面的示例使用的是python,但不一定必须是python。
注意:这是来自客户端的Excel文件。 收到它之前,不能将其转换成任何其他格式。这不是我们的档案
更新:给出 一个有效的代码示例,它将在30秒内打开以下200MB excel文件的答案将得到赏金奖励:https : //drive.google.com/file/d/0B_CXvCTOo7_2VW9id2VXRWZrbzQ/view? usp =sharing。该文件应具有字符串(第1列),日期(第9列)和数字(第11列)。
好吧,如果您的excel像示例一样简单到一个CSV文件(https://drive.google.com/file/d/0B_CXvCTOo7_2UVZxbnpRaEVnaFk/view?usp=sharing),则可以尝试按以下方式打开文件一个zip文件并直接读取每个xml:
Intel i5 4460、12 GB RAM,SSD三星EVO PRO
如果您有很多内存RAM: 此代码需要大量RAM,但需要20到25秒。(您需要参数-Xmx7g)
package com.devsaki.opensimpleexcel; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintWriter; import java.nio.charset.Charset; import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.zip.ZipFile; public class Multithread { public static final char CHAR_END = (char) -1; public static void main(String[] args) throws IOException, ExecutionException, InterruptedException { String excelFile = "C:/Downloads/BigSpreadsheetAllTypes.xlsx"; ZipFile zipFile = new ZipFile(excelFile); long init = System.currentTimeMillis(); ExecutorService executor = Executors.newFixedThreadPool(4); char[] sheet1 = readEntry(zipFile, "xl/worksheets/sheet1.xml").toCharArray(); Future<Object[][]> futureSheet1 = executor.submit(() -> processSheet1(new CharReader(sheet1), executor)); char[] sharedString = readEntry(zipFile, "xl/sharedStrings.xml").toCharArray(); Future<String[]> futureWords = executor.submit(() -> processSharedStrings(new CharReader(sharedString))); Object[][] sheet = futureSheet1.get(); String[] words = futureWords.get(); executor.shutdown(); long end = System.currentTimeMillis(); System.out.println("only read: " + (end - init) / 1000); ///Doing somethin with the file::Saving as csv init = System.currentTimeMillis(); try (PrintWriter writer = new PrintWriter(excelFile + ".csv", "UTF-8");) { for (Object[] rows : sheet) { for (Object cell : rows) { if (cell != null) { if (cell instanceof Integer) { writer.append(words[(Integer) cell]); } else if (cell instanceof String) { writer.append(toDate(Double.parseDouble(cell.toString()))); } else { writer.append(cell.toString()); //Probably a number } } writer.append(";"); } writer.append("\n"); } } end = System.currentTimeMillis(); System.out.println("Main saving to csv: " + (end - init) / 1000); } private static final DateTimeFormatter formatter = DateTimeFormatter.ISO_DATE_TIME; private static final LocalDateTime INIT_DATE = LocalDateTime.parse("1900-01-01T00:00:00+00:00", formatter).plusDays(-2); //The number in excel is from 1900-jan-1, so every number time that you get, you have to sum to that date public static String toDate(double s) { return formatter.format(INIT_DATE.plusSeconds((long) ((s*24*3600)))); } public static String readEntry(ZipFile zipFile, String entry) throws IOException { System.out.println("Initialing readEntry " + entry); long init = System.currentTimeMillis(); String result = null; try (BufferedReader br = new BufferedReader(new InputStreamReader(zipFile.getInputStream(zipFile.getEntry(entry)), Charset.forName("UTF-8")))) { br.readLine(); result = br.readLine(); } long end = System.currentTimeMillis(); System.out.println("readEntry '" + entry + "': " + (end - init) / 1000); return result; } public static String[] processSharedStrings(CharReader br) throws IOException { System.out.println("Initialing processSharedStrings"); long init = System.currentTimeMillis(); String[] words = null; char[] wordCount = "Count=\"".toCharArray(); char[] token = "<t>".toCharArray(); String uniqueCount = extractNextValue(br, wordCount, '"'); words = new String[Integer.parseInt(uniqueCount)]; String nextWord; int currentIndex = 0; while ((nextWord = extractNextValue(br, token, '<')) != null) { words[currentIndex++] = nextWord; br.skip(11); //you can skip at least 11 chars "/t></si><si>" } long end = System.currentTimeMillis(); System.out.println("SharedStrings: " + (end - init) / 1000); return words; } public static Object[][] processSheet1(CharReader br, ExecutorService executorService) throws IOException, ExecutionException, InterruptedException { System.out.println("Initialing processSheet1"); long init = System.currentTimeMillis(); char[] dimensionToken = "dimension ref=\"".toCharArray(); String dimension = extractNextValue(br, dimensionToken, '"'); int[] sizes = extractSizeFromDimention(dimension.split(":")[1]); br.skip(30); //Between dimension and next tag c exists more or less 30 chars Object[][] result = new Object[sizes[0]][sizes[1]]; int parallelProcess = 8; int currentIndex = br.currentIndex; CharReader[] charReaders = new CharReader[parallelProcess]; int totalChars = Math.round(br.chars.length / parallelProcess); for (int i = 0; i < parallelProcess; i++) { int endIndex = currentIndex + totalChars; charReaders[i] = new CharReader(br.chars, currentIndex, endIndex, i); currentIndex = endIndex; } Future[] futures = new Future[parallelProcess]; for (int i = charReaders.length - 1; i >= 0; i--) { final int j = i; futures[i] = executorService.submit(() -> inParallelProcess(charReaders[j], j == 0 ? null : charReaders[j - 1], result)); } for (Future future : futures) { future.get(); } long end = System.currentTimeMillis(); System.out.println("Sheet1: " + (end - init) / 1000); return result; } public static void inParallelProcess(CharReader br, CharReader back, Object[][] result) { System.out.println("Initialing inParallelProcess : " + br.identifier); char[] tokenOpenC = "<c r=\"".toCharArray(); char[] tokenOpenV = "<v>".toCharArray(); char[] tokenAttributS = " s=\"".toCharArray(); char[] tokenAttributT = " t=\"".toCharArray(); String v; int firstCurrentIndex = br.currentIndex; boolean first = true; while ((v = extractNextValue(br, tokenOpenC, '"')) != null) { if (first && back != null) { int sum = br.currentIndex - firstCurrentIndex - tokenOpenC.length - v.length() - 1; first = false; System.out.println("Adding to : " + back.identifier + " From : " + br.identifier); back.plusLength(sum); } int[] indexes = extractSizeFromDimention(v); int s = foundNextTokens(br, '>', tokenAttributS, tokenAttributT); char type = 's'; //3 types: number (n), string (s) and date (d) if (s == 0) { // Token S = number or date char read = br.read(); if (read == '1') { type = 'n'; } else { type = 'd'; } } else if (s == -1) { type = 'n'; } String c = extractNextValue(br, tokenOpenV, '<'); Object value = null; switch (type) { case 'n': value = Double.parseDouble(c); break; case 's': try { value = Integer.parseInt(c); } catch (Exception ex) { System.out.println("Identifier Error : " + br.identifier); } break; case 'd': value = c.toString(); break; } result[indexes[0] - 1][indexes[1] - 1] = value; br.skip(7); ///v></c> } } static class CharReader { char[] chars; int currentIndex; int length; int identifier; public CharReader(char[] chars) { this.chars = chars; this.length = chars.length; } public CharReader(char[] chars, int currentIndex, int length, int identifier) { this.chars = chars; this.currentIndex = currentIndex; if (length > chars.length) { this.length = chars.length; } else { this.length = length; } this.identifier = identifier; } public void plusLength(int n) { if (this.length + n <= chars.length) { this.length += n; } } public char read() { if (currentIndex >= length) { return CHAR_END; } return chars[currentIndex++]; } public void skip(int n) { currentIndex += n; } } public static int[] extractSizeFromDimention(String dimention) { StringBuilder sb = new StringBuilder(); int columns = 0; int rows = 0; for (char c : dimention.toCharArray()) { if (columns == 0) { if (Character.isDigit(c)) { columns = convertExcelIndex(sb.toString()); sb = new StringBuilder(); } } sb.append(c); } rows = Integer.parseInt(sb.toString()); return new int[]{rows, columns}; } public static int foundNextTokens(CharReader br, char until, char[]... tokens) { char character; int[] indexes = new int[tokens.length]; while ((character = br.read()) != CHAR_END) { if (character == until) { break; } for (int i = 0; i < indexes.length; i++) { if (tokens[i][indexes[i]] == character) { indexes[i]++; if (indexes[i] == tokens[i].length) { return i; } } else { indexes[i] = 0; } } } return -1; } public static String extractNextValue(CharReader br, char[] token, char until) { char character; StringBuilder sb = new StringBuilder(); int index = 0; while ((character = br.read()) != CHAR_END) { if (index == token.length) { if (character == until) { return sb.toString(); } else { sb.append(character); } } else { if (token[index] == character) { index++; } else { index = 0; } } } return null; } public static int convertExcelIndex(String index) { int result = 0; for (char c : index.toCharArray()) { result = result * 26 + ((int) c - (int) 'A' + 1); } return result; } }
旧答案(不需要参数Xms7g,因此需要较少的内存): 使用HDD打开并读取示例文件大约需要35秒(200MB),而使用SDD花费的时间要少一些(30秒)。
这里的代码:https : //github.com/csaki/OpenSimpleExcelFast.git
import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintWriter; import java.nio.charset.Charset; import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.zip.ZipFile; public class Launcher { public static final char CHAR_END = (char) -1; public static void main(String[] args) throws IOException, ExecutionException, InterruptedException { long init = System.currentTimeMillis(); String excelFile = "D:/Downloads/BigSpreadsheet.xlsx"; ZipFile zipFile = new ZipFile(excelFile); ExecutorService executor = Executors.newFixedThreadPool(4); Future<String[]> futureWords = executor.submit(() -> processSharedStrings(zipFile)); Future<Object[][]> futureSheet1 = executor.submit(() -> processSheet1(zipFile)); String[] words = futureWords.get(); Object[][] sheet1 = futureSheet1.get(); executor.shutdown(); long end = System.currentTimeMillis(); System.out.println("Main only open and read: " + (end - init) / 1000); ///Doing somethin with the file::Saving as csv init = System.currentTimeMillis(); try (PrintWriter writer = new PrintWriter(excelFile + ".csv", "UTF-8");) { for (Object[] rows : sheet1) { for (Object cell : rows) { if (cell != null) { if (cell instanceof Integer) { writer.append(words[(Integer) cell]); } else if (cell instanceof String) { writer.append(toDate(Double.parseDouble(cell.toString()))); } else { writer.append(cell.toString()); //Probably a number } } writer.append(";"); } writer.append("\n"); } } end = System.currentTimeMillis(); System.out.println("Main saving to csv: " + (end - init) / 1000); } private static final DateTimeFormatter formatter = DateTimeFormatter.ISO_DATE_TIME; private static final LocalDateTime INIT_DATE = LocalDateTime.parse("1900-01-01T00:00:00+00:00", formatter).plusDays(-2); //The number in excel is from 1900-jan-1, so every number time that you get, you have to sum to that date public static String toDate(double s) { return formatter.format(INIT_DATE.plusSeconds((long) ((s*24*3600)))); } public static Object[][] processSheet1(ZipFile zipFile) throws IOException { String entry = "xl/worksheets/sheet1.xml"; Object[][] result = null; char[] dimensionToken = "dimension ref=\"".toCharArray(); char[] tokenOpenC = "<c r=\"".toCharArray(); char[] tokenOpenV = "<v>".toCharArray(); char[] tokenAttributS = " s=\"".toCharArray(); char[] tokenAttributT = " t=\"".toCharArray(); try (BufferedReader br = new BufferedReader(new InputStreamReader(zipFile.getInputStream(zipFile.getEntry(entry)), Charset.forName("UTF-8")))) { String dimension = extractNextValue(br, dimensionToken, '"'); int[] sizes = extractSizeFromDimention(dimension.split(":")[1]); br.skip(30); //Between dimension and next tag c exists more or less 30 chars result = new Object[sizes[0]][sizes[1]]; String v; while ((v = extractNextValue(br, tokenOpenC, '"')) != null) { int[] indexes = extractSizeFromDimention(v); int s = foundNextTokens(br, '>', tokenAttributS, tokenAttributT); char type = 's'; //3 types: number (n), string (s) and date (d) if (s == 0) { // Token S = number or date char read = (char) br.read(); if (read == '1') { type = 'n'; } else { type = 'd'; } } else if (s == -1) { type = 'n'; } String c = extractNextValue(br, tokenOpenV, '<'); Object value = null; switch (type) { case 'n': value = Double.parseDouble(c); break; case 's': value = Integer.parseInt(c); break; case 'd': value = c.toString(); break; } result[indexes[0] - 1][indexes[1] - 1] = value; br.skip(7); ///v></c> } } return result; } public static int[] extractSizeFromDimention(String dimention) { StringBuilder sb = new StringBuilder(); int columns = 0; int rows = 0; for (char c : dimention.toCharArray()) { if (columns == 0) { if (Character.isDigit(c)) { columns = convertExcelIndex(sb.toString()); sb = new StringBuilder(); } } sb.append(c); } rows = Integer.parseInt(sb.toString()); return new int[]{rows, columns}; } public static String[] processSharedStrings(ZipFile zipFile) throws IOException { String entry = "xl/sharedStrings.xml"; String[] words = null; char[] wordCount = "Count=\"".toCharArray(); char[] token = "<t>".toCharArray(); try (BufferedReader br = new BufferedReader(new InputStreamReader(zipFile.getInputStream(zipFile.getEntry(entry)), Charset.forName("UTF-8")))) { String uniqueCount = extractNextValue(br, wordCount, '"'); words = new String[Integer.parseInt(uniqueCount)]; String nextWord; int currentIndex = 0; while ((nextWord = extractNextValue(br, token, '<')) != null) { words[currentIndex++] = nextWord; br.skip(11); //you can skip at least 11 chars "/t></si><si>" } } return words; } public static int foundNextTokens(BufferedReader br, char until, char[]... tokens) throws IOException { char character; int[] indexes = new int[tokens.length]; while ((character = (char) br.read()) != CHAR_END) { if (character == until) { break; } for (int i = 0; i < indexes.length; i++) { if (tokens[i][indexes[i]] == character) { indexes[i]++; if (indexes[i] == tokens[i].length) { return i; } } else { indexes[i] = 0; } } } return -1; } public static String extractNextValue(BufferedReader br, char[] token, char until) throws IOException { char character; StringBuilder sb = new StringBuilder(); int index = 0; while ((character = (char) br.read()) != CHAR_END) { if (index == token.length) { if (character == until) { return sb.toString(); } else { sb.append(character); } } else { if (token[index] == character) { index++; } else { index = 0; } } } return null; } public static int convertExcelIndex(String index) { int result = 0; for (char c : index.toCharArray()) { result = result * 26 + ((int) c - (int) 'A' + 1); } return result; } }