我正在寻找一种最有效的方式来读取文本文件。
考虑所有可能的优势,例如以下优势:
代码只面向特定平台:Windows 操作系统,
而且我实际上只针对当前的 CPU 等硬件编写专用代码。
*不要介意这不是多平台。
只是普通的性能问题
我如何以最快的方式编写代码,将文本文件的每一行读入一个结构中?
说结构是:
/*
 * One line of a text file: its index within the file and its text.
 * NOTE(review): the tag `_FileL` is a reserved identifier in C (underscore
 * followed by an uppercase letter); it is kept here only so that existing
 * `struct _FileL` references keep compiling.
 */
typedef struct _FileL {
    unsigned int lidx;  /* line index */
    char *lncontent;    /* line text */
} FileL;
我在想类似的东西:
给定一个由上面的 FileL 组成的动态数组以及文件的路径,要填充并返回该文件所有行的集合,最高效的方法是什么?
/*
 * getFileLines -- load every line of a text file into one freshly allocated
 * array of FileL.
 *
 * The whole file is read into memory first, the newlines are counted, the
 * array is allocated once with exactly that many entries, and the buffer is
 * then split into lines.  Counting in memory avoids reading the file twice.
 *
 * fullPath   path of the file to read (opened in text mode "r").
 * fileLines  out-parameter; receives the array, or NULL when nothing was
 *            stored.  Each entry's lncontent is an individually malloc'd,
 *            NUL-terminated copy of the line without its '\n'; the caller
 *            frees every lncontent and then the array itself.
 *
 * Returns a FileL summary whose lidx is the number of lines stored and whose
 * lncontent is NULL.  (The original sketch declared a FileL return type but
 * never returned a value, so this convention is new.)  A count of 0 with
 * *fileLines == NULL means either an empty file or a failure (bad argument,
 * open/read error, out of memory).
 */
FileL
getFileLines(char* fullPath, FileL** fileLines)
{
    FileL summary = { .lidx = 0, .lncontent = NULL };
    FILE *srcFL;
    char *data;
    size_t cap = (size_t)1 << 16;
    size_t size = 0;

    if (fileLines == NULL)
        return summary;
    *fileLines = NULL;
    if (fullPath == NULL)
        return summary;

    srcFL = fopen(fullPath, "r");
    if (srcFL == NULL)
        return summary;

    data = malloc(cap);
    if (data == NULL) {
        fclose(srcFL);
        return summary;
    }

    /* Slurp the file, doubling the buffer whenever it fills up. */
    for (;;) {
        size += fread(data + size, 1, cap - size, srcFL);
        if (size < cap)
            break;                      /* short read: end of file or error */

        if (cap > ((size_t)-1) / 2) {   /* doubling would overflow size_t */
            free(data);
            fclose(srcFL);
            return summary;
        }
        char *bigger = realloc(data, cap * 2);
        if (bigger == NULL) {
            free(data);
            fclose(srcFL);
            return summary;
        }
        data = bigger;
        cap *= 2;
    }

    int readFailed = ferror(srcFL);
    fclose(srcFL);
    if (readFailed) {
        free(data);
        return summary;
    }

    /* Count lines; a final line without a trailing '\n' still counts. */
    const char *end = data + size;
    size_t linesCount = 0;
    for (const char *scan = data; scan < end; ) {
        const char *nl = memchr(scan, '\n', (size_t)(end - scan));
        ++linesCount;
        if (nl == NULL)
            break;
        scan = nl + 1;
    }

    /* lidx is an unsigned int, so it cannot index more lines than that. */
    if (linesCount == 0 || linesCount > (unsigned int)-1) {
        free(data);
        return summary;
    }

    FileL *lines = calloc(linesCount, sizeof *lines);
    if (lines == NULL) {
        free(data);
        return summary;
    }

    const char *cursor = data;
    for (size_t curLnIndex = 0; curLnIndex < linesCount; ++curLnIndex) {
        const char *nl = memchr(cursor, '\n', (size_t)(end - cursor));
        const char *stop = (nl != NULL) ? nl : end;
        size_t len = (size_t)(stop - cursor);

        char *copy = malloc(len + 1);
        if (copy == NULL) {
            /* Undo everything stored so far; the caller gets nothing. */
            while (curLnIndex > 0)
                free(lines[--curLnIndex].lncontent);
            free(lines);
            free(data);
            return summary;
        }
        memcpy(copy, cursor, len);
        copy[len] = '\0';

        lines[curLnIndex].lidx = (unsigned int)curLnIndex;
        lines[curLnIndex].lncontent = copy;

        cursor = (nl != NULL) ? nl + 1 : end;
    }

    free(data);
    *fileLines = lines;
    summary.lidx = (unsigned int)linesCount;
    return summary;
}
怎么会更有效呢?
编辑:
我想更新这个问题,把重点放在效率上:逐行扫描文件时,最高效的具体做法是什么?如果逐字符扫描是唯一的办法,那么如何避免对每个字符都调用一次函数?……凡是能最大化每行循环性能的做法都欢迎,也就是说,如何高效地确定一行的边界……
由于行会被缓冲区边界截断,直接使用 read(2) 的效果不是很好(例如,如果一次读取 1000 个字符,最后一行可能从偏移量 990 开始,而它还需要缓冲区之外的 50 个字符)。
read(2)
最好使用 fgets[推荐]或 mmap[效果因情况而异]。下面分别给出这两种方法的示例。警告:代码可以编译,但未经测试,也没有做太多错误检查。
fgets
mmap
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/mman.h>

/* One line of a text file. */
typedef struct {
    unsigned int lidx;  /* zero-based line number */
    char *lncontent;    /* malloc'd, NUL-terminated text without the '\n';
                           NULL only in the terminator entry */
} FileL;

/* Growable FileL array used while a file is being scanned. */
typedef struct {
    FileL *items;
    size_t count;
    size_t cap;
} LineVec;

enum {
    LINEVEC_INITIAL_CAP = 128,
    LINEBUF_INITIAL_CAP = 4096
};

/* Append a copy of text[0..len) as the next line; returns 0, or -1 on failure. */
static int
linevec_push(LineVec *vec, const char *text, size_t len)
{
    /* Always keep one spare slot so the terminator entry is guaranteed to fit. */
    if (vec->count + 2 > vec->cap) {
        size_t newcap = (vec->cap == 0) ? LINEVEC_INITIAL_CAP : vec->cap * 2;

        if (newcap <= vec->cap || newcap > SIZE_MAX / sizeof *vec->items)
            return -1;

        FileL *grown = realloc(vec->items, newcap * sizeof *grown);
        if (grown == NULL)
            return -1;
        vec->items = grown;
        vec->cap = newcap;
    }

    /* lidx is an unsigned int and must be able to hold the index. */
    if (vec->count >= UINT_MAX)
        return -1;

    char *copy = malloc(len + 1);
    if (copy == NULL)
        return -1;
    memcpy(copy, text, len);
    copy[len] = '\0';

    vec->items[vec->count].lidx = (unsigned int)vec->count;
    vec->items[vec->count].lncontent = copy;
    vec->count++;
    return 0;
}

/* Free every line collected so far and the array itself. */
static void
linevec_discard(LineVec *vec)
{
    for (size_t i = 0; i < vec->count; i++)
        free(vec->items[i].lncontent);
    free(vec->items);
    vec->items = NULL;
    vec->count = 0;
    vec->cap = 0;
}

/*
 * Trim the array to its used size, append the terminator entry and hand the
 * array over to the caller.  Returns NULL (leaving vec untouched) only when
 * no array exists yet and one cannot be allocated.
 */
static FileL *
linevec_finish(LineVec *vec)
{
    FileL *out = realloc(vec->items, (vec->count + 1) * sizeof *out);

    if (out == NULL) {
        if (vec->items == NULL)
            return NULL;
        /* Shrinking failed; the untrimmed array still has the spare slot. */
        out = vec->items;
    }

    out[vec->count].lidx = (unsigned int)vec->count;
    out[vec->count].lncontent = NULL;

    vec->items = NULL;
    vec->count = 0;
    vec->cap = 0;
    return out;
}

/*
 * freefilelines -- release an array returned by getfileline_fgets() or
 * getfilelines_mmap().  Passing NULL is a no-op.
 */
void
freefilelines(FileL *linelist)
{
    if (linelist == NULL)
        return;
    for (FileL *line = linelist; line->lncontent != NULL; line++)
        free(line->lncontent);
    free(linelist);
}

/*
 * getfileline_fgets -- read in file lines using fgets.
 *
 * Returns a malloc'd array with one entry per line (newline stripped)
 * followed by a terminator entry whose lncontent is NULL and whose lidx is
 * the line count.  An empty file yields just the terminator.  Returns NULL
 * if the file cannot be opened or read, or if memory runs out.  Release the
 * result with freefilelines().
 *
 * Lines of any length are supported: the line buffer grows as needed rather
 * than splitting long lines.  Bytes after an embedded NUL are not preserved,
 * because fgets() does not report how many bytes it stored.
 */
FileL *
getfileline_fgets(char *file)
{
    LineVec vec = {0};
    FileL *result = NULL;
    char *buf = NULL;
    size_t bufcap = LINEBUF_INITIAL_CAP;
    size_t used = 0;            /* bytes of the current, unfinished line */

    FILE *xf = fopen(file, "r");
    if (xf == NULL)
        return NULL;

    buf = malloc(bufcap);
    if (buf == NULL)
        goto fail;

    for (;;) {
        /* fgets needs room for at least one character plus the NUL. */
        if (bufcap - used < 2) {
            if (bufcap > SIZE_MAX / 2)
                goto fail;
            char *grown = realloc(buf, bufcap * 2);
            if (grown == NULL)
                goto fail;
            buf = grown;
            bufcap *= 2;
        }

        size_t room = bufcap - used;
        int want = (room > (size_t)INT_MAX) ? INT_MAX : (int)room;

        if (fgets(buf + used, want, xf) == NULL)
            break;
        used += strlen(buf + used);

        if (used > 0 && buf[used - 1] == '\n') {
            --used;             /* strip the newline */
        } else if (!feof(xf)) {
            continue;           /* buffer filled mid-line: keep reading */
        }

        if (linevec_push(&vec, buf, used) != 0)
            goto fail;
        used = 0;
    }

    if (ferror(xf))
        goto fail;

    /* The file ended exactly where the buffer filled up: flush that line. */
    if (used > 0 && linevec_push(&vec, buf, used) != 0)
        goto fail;

    result = linevec_finish(&vec);
    if (result == NULL)
        goto fail;

    free(buf);
    fclose(xf);
    return result;

fail:
    linevec_discard(&vec);
    free(buf);
    fclose(xf);
    return NULL;
}

/*
 * getfilelines_mmap -- read in file lines by mmap'ing the entire file.
 *
 * Same result contract as getfileline_fgets(): a terminator-ended array to
 * be released with freefilelines(), or NULL on failure.  A final line that
 * lacks a trailing newline is still returned.
 */
FileL *
getfilelines_mmap(char *file)
{
    LineVec vec = {0};
    FileL *result = NULL;
    struct stat st;
    int ok = 1;

    int fd = open(file, O_RDONLY);
    if (fd < 0)
        return NULL;

    if (fstat(fd, &st) != 0 || st.st_size < 0 ||
        (uintmax_t)st.st_size > SIZE_MAX) {
        close(fd);
        return NULL;
    }
    size_t size = (size_t)st.st_size;

    /* mmap rejects zero-length mappings, so an empty file skips this part. */
    if (size > 0) {
        char *fbuf = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
        if (fbuf == MAP_FAILED) {
            close(fd);
            return NULL;
        }

        const char *end = fbuf + size;
        const char *lhs = fbuf;

        while (lhs < end) {
            /* The mapping is not NUL-terminated: search a bounded range. */
            const char *rhs = memchr(lhs, '\n', (size_t)(end - lhs));
            const char *stop = (rhs != NULL) ? rhs : end;

            /* stop - lhs already excludes the newline itself. */
            if (linevec_push(&vec, lhs, (size_t)(stop - lhs)) != 0) {
                ok = 0;
                break;
            }
            if (rhs == NULL)
                break;
            lhs = rhs + 1;
        }

        munmap(fbuf, size);
    }
    close(fd);

    if (ok)
        result = linevec_finish(&vec);
    if (result == NULL)
        linevec_discard(&vec);
    return result;
}
更新
您想要基准测试。好,这就给您。我生成了 167GB 的随机文本数据,分布在 140 个文件中。下面输出中的 L: 是行数,W: 是最大行宽,其后是以 MB 为单位的文件大小。时间以纳秒为单位。请注意,两者的倍数因文件而异,但看起来 mmap 胜出。
L:
W:
23:39:35.528333425 NEWDAY 11/09/15 23:39:35.528333425 ph: starting 23107 ... 23:39:35.528868198 ph: ARGV fastreadgo ... F001: L:324255 W:2097 324.086MB 368297556 fgets 189180143 mmap F002: L:329608 W:2822 443.649MB 475989122 fgets 248517335 mmap F003: L:401476 W:6186 1185.270MB 1206999411 fgets 657703847 mmap F004: L:729379 W:9350 3253.185MB 3199692871 fgets 1776602082 mmap F005: L:85857 W:5185 212.599MB 223489564 fgets 122404608 mmap F006: L:62871 W:5418 162.384MB 167640768 fgets 93127042 mmap F007: L:298836 W:1083 154.481MB 196584474 fgets 100582134 mmap F008: L:221513 W:2732 288.694MB 322105867 fgets 164965547 mmap F009: L:420815 W:8906 1789.672MB 1801309998 fgets 961136893 mmap F010: L:126712 W:8251 498.905MB 499274233 fgets 275901635 mmap F011: L:443166 W:8822 1865.753MB 1839816883 fgets 1001882651 mmap F012: L:385632 W:2162 398.467MB 467223648 fgets 248126909 mmap F013: L:629448 W:4413 1324.616MB 1432284339 fgets 777593198 mmap F014: L:510357 W:7313 1779.348MB 1919309671 fgets 1079111734 mmap F015: L:188434 W:1254 112.922MB 152367682 fgets 78959769 mmap F016: L:82139 W:4355 170.586MB 193117015 fgets 105417805 mmap F017: L:389499 W:9063 1681.805MB 1730894028 fgets 913789253 mmap F018: L:992849 W:3265 1547.875MB 1685006767 fgets 875256226 mmap F019: L:931502 W:9647 4285.883MB 11181005402 fgets 2361255543 mmap F020: L:266047 W:7454 946.298MB 955772708 fgets 537059554 mmap F021: L:572709 W:67 18.835MB 86539501 fgets 43437303 mmap F022: L:68373 W:3042 98.684MB 110325296 fgets 57538963 mmap F023: L:651839 W:2006 624.153MB 706094723 fgets 369122560 mmap F024: L:414658 W:6482 1284.202MB 1294352248 fgets 700279769 mmap F025: L:984554 W:3441 1616.269MB 1742233370 fgets 903755131 mmap F026: L:527629 W:3214 808.812MB 872660092 fgets 465403685 mmap F027: L:572103 W:6219 1696.582MB 1758562312 fgets 933024466 mmap F028: L:793354 W:5967 2255.653MB 2341754885 fgets 1251633414 mmap F029: L:690669 W:389 128.888MB 230036016 fgets 119381427 mmap F030: L:902519 W:8182 3523.415MB 
6665490426 fgets 1930049511 mmap F031: L:179482 W:2361 201.850MB 225333697 fgets 120424715 mmap F032: L:342396 W:4135 675.885MB 706219203 fgets 379974402 mmap F033: L:762237 W:4000 1455.780MB 1535236381 fgets 805977762 mmap F034: L:421947 W:8289 1669.038MB 1686877811 fgets 900813641 mmap F035: L:367349 W:5829 1022.373MB 1051584165 fgets 566680706 mmap F036: L:433973 W:5064 1049.724MB 1097920811 fgets 584855289 mmap F037: L:615918 W:9152 2686.372MB 2743719787 fgets 1468536802 mmap F038: L:365187 W:1564 272.829MB 326368364 fgets 171071840 mmap F039: L:61305 W:477 14.002MB 22945438 fgets 11949833 mmap F040: L:396788 W:8576 1622.049MB 1633217001 fgets 884460205 mmap F041: L:245326 W:5068 592.450MB 610530077 fgets 328366102 mmap F042: L:986409 W:9174 4313.608MB 17048484450 fgets 2413375121 mmap F043: L:367968 W:9703 1703.785MB 1677764299 fgets 922735827 mmap F044: L:630679 W:9763 2942.911MB 4742195305 fgets 1585438052 mmap F045: L:397072 W:7717 1459.554MB 1533634531 fgets 860518182 mmap F046: L:918129 W:9127 3996.179MB 10259712214 fgets 2171550789 mmap F047: L:770706 W:2720 999.584MB 1097599308 fgets 604894013 mmap F048: L:472462 W:5011 1127.896MB 1164186449 fgets 621979909 mmap F049: L:301834 W:4456 642.703MB 664420452 fgets 354255131 mmap F050: L:213878 W:2913 297.159MB 321396955 fgets 168664579 mmap F051: L:549950 W:1681 441.842MB 510553173 fgets 260455948 mmap F052: L:63502 W:8785 267.074MB 265697002 fgets 142457939 mmap F053: L:880396 W:6821 2864.595MB 3769485430 fgets 1591318886 mmap F054: L:180543 W:9055 779.566MB 773462618 fgets 428627500 mmap F055: L:964409 W:8454 3884.437MB 9085108760 fgets 2149540695 mmap F056: L:675120 W:8912 2872.781MB 2885159527 fgets 1559580604 mmap F057: L:345151 W:4157 684.052MB 724456228 fgets 387170980 mmap F058: L:69114 W:4585 150.535MB 157447952 fgets 84782951 mmap F059: L:304627 W:9441 1370.777MB 1376517664 fgets 739170571 mmap F060: L:799770 W:3145 1200.762MB 1304001986 fgets 679163462 mmap F061: L:808699 W:6544 2523.949MB 
2590924710 fgets 1385627164 mmap F062: L:270082 W:313 40.592MB 78777863 fgets 40733146 mmap F063: L:308883 W:333 49.262MB 93696361 fgets 48580067 mmap F064: L:237002 W:2618 296.446MB 347315129 fgets 178078149 mmap F065: L:279040 W:1341 178.685MB 217230537 fgets 113291912 mmap F066: L:809386 W:2808 1085.734MB 1177480987 fgets 615248653 mmap F067: L:279448 W:8560 1140.280MB 1151044788 fgets 614662533 mmap F068: L:80012 W:7441 283.334MB 286915203 fgets 158077955 mmap F069: L:366808 W:7197 1260.521MB 1292679736 fgets 696686301 mmap F070: L:272693 W:9275 1206.527MB 1220763889 fgets 658383413 mmap F071: L:792609 W:1419 537.088MB 645760162 fgets 334886975 mmap F072: L:742523 W:8640 3059.604MB 5711688133 fgets 1665879727 mmap F073: L:583753 W:2992 833.759MB 910037328 fgets 483847376 mmap F074: L:252560 W:7178 864.593MB 868625985 fgets 471770777 mmap F075: L:154327 W:7026 515.619MB 516135586 fgets 277690063 mmap F076: L:121839 W:7131 414.684MB 424518600 fgets 230518357 mmap F077: L:760327 W:1421 515.475MB 622630358 fgets 314592959 mmap F078: L:907033 W:3485 1508.042MB 1622356297 fgets 845695719 mmap F079: L:884787 W:7491 3162.774MB 4932864122 fgets 1749065509 mmap F080: L:432556 W:6039 1245.779MB 1281231973 fgets 693532807 mmap F081: L:639804 W:6419 1957.747MB 2107303517 fgets 1130299002 mmap F082: L:388669 W:283 52.804MB 111686630 fgets 57177517 mmap F083: L:300542 W:1943 278.825MB 336538347 fgets 177494803 mmap F084: L:941 W:7,3878 1.770MB 2047540 fgets 1347230 mmap F085: L:85747 W:1841 75.417MB 92274672 fgets 49362653 mmap F086: L:935559 W:5950 2656.411MB 2734326131 fgets 1487147776 mmap F087: L:936993 W:1197 535.727MB 672872562 fgets 348765250 mmap F088: L:409671 W:5235 1023.358MB 1099320520 fgets 606047909 mmap F089: L:362220 W:5434 938.805MB 991448256 fgets 529093412 mmap F090: L:628156 W:3682 1103.909MB 1185317812 fgets 637902310 mmap F091: L:655456 W:6051 1892.574MB 1978859918 fgets 1066368241 mmap F092: L:356309 W:5946 1012.893MB 1046818030 fgets 562463577 mmap 
F093: L:878726 W:2946 1236.162MB 1368885560 fgets 701514499 mmap F094: L:583863 W:747 208.701MB 293177923 fgets 148045230 mmap F095: L:51374 W:3752 91.670MB 98830853 fgets 52715699 mmap F096: L:757271 W:4698 1698.664MB 1790811621 fgets 946452098 mmap F097: L:665420 W:1814 575.369MB 664290848 fgets 347346293 mmap F098: L:152806 W:4480 326.336MB 338683910 fgets 185037896 mmap F099: L:39027 W:2368 44.104MB 49144948 fgets 26701307 mmap F100: L:896926 W:8209 3513.328MB 7460727008 fgets 1900543480 mmap F101: L:796628 W:5663 2149.888MB 2207899454 fgets 1187397751 mmap F102: L:876500 W:1986 831.161MB 934850175 fgets 486626065 mmap F103: L:188682 W:773 69.722MB 97285527 fgets 48765985 mmap F104: L:648920 W:9590 2969.446MB 5021968784 fgets 1622268239 mmap F105: L:827850 W:2123 837.892MB 946063144 fgets 498163978 mmap F106: L:879828 W:2867 1205.021MB 1304295176 fgets 682155187 mmap F107: L:970674 W:3830 1771.667MB 1883664162 fgets 989569477 mmap F108: L:4461 W:5634 11.840MB 12680659 fgets 7159011 mmap F109: L:477207 W:1067 243.224MB 315370392 fgets 162708299 mmap F110: L:140308 W:5817 389.132MB 397510757 fgets 216204659 mmap F111: L:253358 W:4425 534.937MB 559943651 fgets 297109524 mmap F112: L:903292 W:7989 3441.851MB 7327033977 fgets 1906200470 mmap F113: L:555989 W:620 164.835MB 245638038 fgets 126559933 mmap F114: L:596425 W:2330 664.143MB 739017073 fgets 391237002 mmap F115: L:298147 W:9741 1387.530MB 1363229979 fgets 744420477 mmap F116: L:180269 W:4522 389.175MB 402702977 fgets 213875684 mmap F117: L:238597 W:9021 1029.314MB 1033070395 fgets 550442036 mmap F118: L:183723 W:8705 764.555MB 765959712 fgets 413667801 mmap F119: L:174802 W:549 45.896MB 70635625 fgets 35721310 mmap F120: L:883013 W:4666 1963.677MB 2062197751 fgets 1092583730 mmap F121: L:858995 W:9218 3776.896MB 9278222413 fgets 2309240152 mmap F122: L:368895 W:5862 1030.174MB 1076473726 fgets 582127460 mmap F123: L:208043 W:5672 563.889MB 579427255 fgets 310321934 mmap F124: L:768482 W:4953 1816.657MB 
1888233155 fgets 997797932 mmap F125: L:905425 W:2812 1214.882MB 1394928053 fgets 724059403 mmap F126: L:54137 W:4690 121.066MB 125124760 fgets 67811537 mmap F127: L:448100 W:9643 2061.624MB 2066282543 fgets 1126488038 mmap F128: L:748979 W:2111 754.038MB 854095589 fgets 447406977 mmap F129: L:611388 W:6954 2026.306MB 2074219917 fgets 1118353849 mmap F130: L:782834 W:9946 3715.067MB 7338500374 fgets 2029571615 mmap F131: L:52630 W:7858 197.495MB 200711062 fgets 110759659 mmap F132: L:930983 W:7363 3270.546MB 3376813502 fgets 1776365395 mmap F133: L:73216 W:2127 74.344MB 85854537 fgets 46756335 mmap F134: L:583306 W:2495 694.192MB 766430638 fgets 408095226 mmap F135: L:877424 W:2964 1241.342MB 1339005805 fgets 702659289 mmap F136: L:414854 W:5104 1010.006MB 1057372341 fgets 556583887 mmap F137: L:333176 W:4912 781.109MB 820007572 fgets 435433956 mmap F138: L:564006 W:6933 1863.574MB 1905024687 fgets 1030574213 mmap F139: L:829571 W:9152 3622.399MB 7338698902 fgets 2002428493 mmap F140: L:560210 W:7443 1990.047MB 2012670010 fgets 1098720143 mmap 00:00:58.770988225 NEWDAY 11/10/15 00:00:58.770988225 ph: complete (ELAPSED: 00:21:23.190149545)
这是我用来生成文件的perl脚本:
#!/usr/bin/perl
# grpcntgen -- generate test data for fastread algorithms
#
# arguments:
#   "-W" - maximum line width
#   "-L" - maximum number of lines
#   "-T" - number of test files to generate
#   "-O" - output file (e.g. foo%.txt)
#
# NOTE: with no arguments or missing arguments will prompt
#
# NOTE: several variables (@argv, $ofile, $tstmax, $tstcur, $genvbq, $xfdst)
# are shared between subs through package globals and local(); keep the
# local() declarations as they are when editing.

#pragma pgmlns
# tstgen -- test generation help routines

# gengetstr -- get a string/number
#   $numflg  0=string, 1=yes/no flag (0/1), 2=positive number
#   $opt     command-line option prefix to look for in @argv (e.g. "-W")
#   $prompt  text shown when the value has to be asked for
#   $lim     optional maximum (also the default for a bare numeric option)
# returns the value taken from @argv, or else read from STDIN
sub gengetstr
{
    my($numflg,$opt,$prompt,$lim) = @_;
    my($arg);
    my($askflg);
    my($val);

    select(STDOUT);
    $| = 1;

    {
        # search command line for -whatever
        foreach $arg (@argv) {
            if ($arg =~ /^$opt(.*)$/) {
                $val = $1;
                if ($numflg && ($val eq "")) {
                    $val = $lim;
                    $val //= 1;
                }
                last;
            }
        }
        last if (defined($val));

        $askflg = 1;

        while (1) {
            printf("Enter ")
                if ($numflg != 1);

            printf("%s",$prompt);

            if ($numflg == 1) {
                printf(" (0/1)? ");
            }
            else {
                printf(": ");
            }

            $val = <STDIN>;

            # end of input: stop instead of re-prompting forever
            die("gengetstr: unexpected end of input\n")
                unless (defined($val));

            chomp($val);

            if ($numflg == 0) {
                last if ($val ne "");
                next;
            }

            # an empty response for a number with a maximum means use it
            if (($numflg == 2) && ($val eq "") && defined($lim)) {
                $val = $lim;
                last;
            }

            next unless ($val =~ /^\d+$/);
            $val += 0;

            last if ($numflg == 1);

            next if ($val <= 0);
            last unless (defined($lim));
            last if ($val <= $lim);
        }
    }

    # echo a value that came from the command line
    unless ($askflg) {
        printf("%s: %s\n",$prompt,$val);
    }

    $val;
}

# genrun -- generate all tests
#   arguments: [-options] output-file, number-of-tests, test-procedure
#   a "%" in the output file name is replaced by a zero-padded test number,
#   giving one output file per test
sub genrun
{
    local(@argv) = @_;
    local($ofile,$tstmax,$tstproc);
    local($tstcur);
    local($splitflg);
    local($genvbq);
    my($sym);
    my($numfmt);
    my($xfile);

    $genvbq = genvbq(\@argv);

    $ofile = shift(@argv);
    $tstmax = shift(@argv);
    $tstproc = shift(@argv);

    # split each test into separate file
    if ($ofile =~ /%/) {
        $splitflg = 1;
        $numfmt = sprintf("%d",$tstmax);
        $numfmt = length($numfmt);
        $numfmt = sprintf("_%%%d.%dd",$numfmt,$numfmt);
        $ofile =~ s/%/$numfmt/;
    }

    # single output file: open it once, up front
    {
        last if ($splitflg);
        genopen($ofile);
    }

    for ($tstcur = 1;  $tstcur <= $tstmax;  ++$tstcur) {
        {
            last unless ($splitflg);
            $xfile = sprintf($ofile,$tstcur);
            genopen($xfile);
        }

        &$tstproc();

        {
            last unless ($splitflg);
            genclose();
        }
    }

    {
        last if ($splitflg);
        genclose();
    }
}

# genvbq -- get options
#   strips leading "-name" / "-name=value" entries from the given array
#   reference and returns them as a hash reference (value 1 when no "=")
sub genvbq
{
    my($argv) = @_;
    my($sym);
    my($val);
    my($env);

    $env = {};

    while (1) {
        $sym = $argv->[0];
        last unless ($sym =~ s/^-//);

        shift(@$argv);

        if ($sym =~ /^([^=]+)=(.+)$/) {
            ($sym,$val) = ($1,$2);
        }
        else {
            $val = 1;
        }

        $env->{$sym} = $val;
    }

    $env;
}

# genopen -- open output
#   does nothing when the "n" option is set
sub genopen
{
    my($ofile) = @_;

    $gen_ofile = $ofile;

    {
        last if ($genvbq->{"n"});

        # three-argument open: the file name is never parsed as a mode
        last if (open($xfdst,">",$ofile));

        die("genopen: unable to open '$ofile' -- $!\n");
    }
}

# genclose -- close output
sub genclose
{
    close($xfdst);
}

# geninit -- initialize for single test
sub geninit
{
    undef($genout_lhs);
    undef($genout_pre);
}

# genout -- output data
#   accumulates $rhs into the pending line and writes the pending line out
#   before it would reach 78 columns; call with no argument to flush
sub genout
{
    my($rhs) = @_;

    {
        if (defined($rhs)) {
            last if ((length($genout_pre) + length($genout_lhs) +
                length($rhs)) < 78);
        }

        last if ($genout_lhs eq "");

        print($xfdst $genout_pre,$genout_lhs,"\n");
        undef($genout_lhs);
    }

    $genout_lhs .= $rhs
        if (defined($rhs));
}

# genrand -- get random number
#   returns an integer in the range 1..$lim
sub genrand
{
    my($lim) = @_;
    my($val);

    $val = int(rand($lim));
    $val += 1;

    $val;
}

# genfmtof -- get number format
#   returns a printf format (" %Nd") wide enough for $num
sub genfmtof
{
    my($num) = @_;
    my($fmt);

    $fmt = sprintf("%d",$num);
    $fmt = length($fmt);
    $fmt = sprintf(" %%%dd",$fmt);

    $fmt;
}

1;

master(@ARGV);
exit(0);

# master -- master control
sub master
{
    local(@argv) = @_;

    $Wmax = gengetstr(2,"-W","maximum line width");
    $Lmax = gengetstr(2,"-L","maximum number of lines / file");
    $tstmax = gengetstr(2,"-T","number of tests");

    while (1) {
        $ofile = gengetstr(0,"-O","output file name");
        last if ($ofile =~ /%/);
        printf("fastreadgen: filename must have %% in it (e.g. foo%%.txt)\n");

        # drop a rejected command-line -O so the next pass prompts instead
        # of finding the same bad name again and looping forever
        @argv = grep { !/^-O/ } @argv;
    }

    genrun($ofile,$tstmax,\&dotest);
}

# dotest -- generate a test
#   picks a random line-width limit and line count for this file, then
#   writes that many lines to the current output file
sub dotest
{
    my($lno,$chroff);
    my($lhs);

    $Wlim = genrand($Wmax);
    $Llim = genrand($Lmax);

    printf("dotest: T=%d/%d W=%d L=%d\n",$tstcur,$tstmax,$Wlim,$Llim);

    for ($lno = 1;  $lno <= $Llim;  ++$lno) {
        $Wcur = genrand($Wlim);
        $Wcur -= 1;

        # each line is a single randomly chosen character (0x20..0x7D)
        # repeated $Wcur times; only one random draw is made per non-empty
        # line, which keeps generation of very large files fast
        $lhs = "";
        if ($Wcur > 0) {
            $chroff = genrand(0x7E - 0x20);
            $chroff += 0x20;
            $chroff -= 1;
            $chroff = chr($chroff);
            $lhs = $chroff x $Wcur;
        }

        print($xfdst $lhs,"\n");
    }
}