From f9556dca51484c270284c181337fc041964f2db0 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 1 Jun 2018 10:59:42 +0800 Subject: [PATCH] use open_files reader to read multiple files --- .../howto/cluster/fluid_recordio.md} | 26 ++++++++++-------- .../reader/create_recordio_file_reader_op.cc | 12 ++++---- python/paddle/fluid/layers/io.py | 20 ++++++-------- python/paddle/fluid/recordio_writer.py | 12 ++++++-- tools/codestyle/docstring_checker.pyc | Bin 0 -> 11769 bytes 5 files changed, 37 insertions(+), 33 deletions(-) rename doc/{v2/howto/recordio/README.md => fluid/howto/cluster/fluid_recordio.md} (88%) create mode 100644 tools/codestyle/docstring_checker.pyc diff --git a/doc/v2/howto/recordio/README.md b/doc/fluid/howto/cluster/fluid_recordio.md similarity index 88% rename from doc/v2/howto/recordio/README.md rename to doc/fluid/howto/cluster/fluid_recordio.md index 3f81d54b8e1..0e8b98542d1 100644 --- a/doc/v2/howto/recordio/README.md +++ b/doc/fluid/howto/cluster/fluid_recordio.md @@ -89,14 +89,14 @@ The above codes would generate multiple RecordIO files on your host like: ```bash . - \_mnist.recordio-00000 - |-mnist.recordio-00001 - |-mnist.recordio-00002 - |-mnist.recordio-00003 - |-mnist.recordio-00004 + \_mnist-00000.recordio + |-mnist-00001.recordio + |-mnist-00002.recordio + |-mnist-00003.recordio + |-mnist-00004.recordio ``` -1. read these RecordIO files with `fluid.layers.io.open_recordio_file` +1. 
open multiple RecordIO files by `fluid.layers.io.open_files` For a distributed training job, the distributed operator system will schedule trainer process on multiple nodes, each trainer process reads parts of the whole training data, we usually take the following approach to make the training @@ -113,10 +113,12 @@ def gen_train_list(file_pattern, trainers, trainer_id): trainers = int(os.getenv("TRAINERS")) trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) -data_file = fluid.layers.io.open_recordio_file( - filename=gen_train_list("./mnist.recordio*", trainers, trainer_id), - shapes=[(-1, 784),(-1, 1)], - lod_levels=[0, 0], - dtypes=["float32", "int32"]) -data_file = fluid.layers.io.batch(data_file, batch_size=4) +data_file = fluid.layers.io.open_files( + filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0), + thread_num=1, + shapes=[(-1, 784),(-1, 1)], + lod_levels=[0, 0], + dtypes=["float32", "int32"]) +img, label = fluid.layers.io.read_file(data_file) +... ``` diff --git a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc index 6b6d4470268..282ec3f36b9 100644 --- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc +++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc @@ -65,22 +65,20 @@ class CreateRecordIOReaderOp : public framework::OperatorBase { static_cast(shape_concat.size()), "The accumulate of all ranks should be equal to the " "shape concat's length."); - auto filenames = Attr>("filenames"); + std::string filename = Attr("filename"); auto* out = scope.FindVar(Output("Out")) ->template GetMutable(); - for (auto& fn : filenames) { - out->Reset( - new RecordIOFileReader(fn, RestoreShapes(shape_concat, ranks))); - } + + out->Reset(new RecordIOFileReader( + filename, RestoreShapes(shape_concat, ranks))); } }; class CreateRecordIOReaderOpMaker : public FileReaderMakerBase { protected: void Apply() override { - AddAttr>("filenames", - "The 
filenames of record io reader"); + AddAttr("filename", "The filename of record io reader"); AddComment(R"DOC( CreateRecordIOReader Operator diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index b9d55827304..8758ac9f94a 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -21,7 +21,7 @@ from ..layer_helper import LayerHelper from ..executor import global_scope __all__ = [ - 'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_files', + 'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer', 'random_data_generator', 'Preprocessor' ] @@ -291,12 +291,12 @@ def _copy_reader_create_op_(block, op): return new_op -def open_recordio_files(filenames, - shapes, - lod_levels, - dtypes, - pass_num=1, - for_parallel=True): +def open_recordio_file(filename, + shapes, + lod_levels, + dtypes, + pass_num=1, + for_parallel=True): """ Open a RecordIO file @@ -304,7 +304,7 @@ def open_recordio_files(filenames, Via the Reader Variable, we can get data from the given RecordIO file. Args: - filename(str) or list(str): The RecordIO file's name. + filename(str): The RecordIO file's name. shapes(list): List of tuples which declaring data shapes. lod_levels(list): List of ints which declaring data lod_level. dtypes(list): List of strs which declaring data type. 
@@ -336,8 +336,6 @@ def open_recordio_files(filenames, ranks.append(len(shape)) var_name = unique_name('open_recordio_file') - if isinstance(filenames, str): - filenames = [filenames] startup_blk = default_startup_program().current_block() startup_var = startup_blk.create_var(name=var_name) @@ -347,7 +345,7 @@ def open_recordio_files(filenames, attrs={ 'shape_concat': shape_concat, 'lod_levels': lod_levels, - 'filenames': filenames, + 'filename': filename, 'ranks': ranks }) diff --git a/python/paddle/fluid/recordio_writer.py b/python/paddle/fluid/recordio_writer.py index 9557f91bb36..8d48e9abef0 100644 --- a/python/paddle/fluid/recordio_writer.py +++ b/python/paddle/fluid/recordio_writer.py @@ -12,9 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import core import contextlib -__all__ = ['convert_reader_to_recordio_file'] +__all__ = [ + 'convert_reader_to_recordio_file', 'convert_reader_to_recordio_files' +] @contextlib.contextmanager @@ -48,7 +51,7 @@ def convert_reader_to_recordio_file( def convert_reader_to_recordio_files( - filename_suffix, + filename, batch_per_file, reader_creator, feeder, @@ -57,13 +60,16 @@ def convert_reader_to_recordio_files( feed_order=None): if feed_order is None: feed_order = feeder.feed_names + f_name, f_ext = os.path.splitext(filename) + assert (f_ext == ".recordio") + lines = [] f_idx = 0 counter = 0 for idx, batch in enumerate(reader_creator()): lines.append(batch) if idx >= batch_per_file and idx % batch_per_file == 0: - filename = "%s-%05d" % (filename_suffix, f_idx) + filename = "%s-%05d%s" % (f_name, f_idx, f_ext) with create_recordio_writer(filename, compressor, max_num_records) as writer: for l in lines: diff --git a/tools/codestyle/docstring_checker.pyc b/tools/codestyle/docstring_checker.pyc new file mode 100644 index 0000000000000000000000000000000000000000..07e875aec6c9bae8002bde4223348c6a29647b03 GIT binary patch literal 11769 
zcmdT~%X1XR8SmL$t+b1Tc#4Oy8S`8MD~X3+M8*~fV-j0Xvp5to)Ogt$Nh8hftfpse z$%aa$aOH8#E%^^pm8zUlxutT5Z$2h}Kyu6}hg439EBXDto}FEU>_FK9N7A&qzkYxH zy}R@O7%8{*jJ)4a=}#X2-^8bXiNsTC4ymQQoC+;9hm6VR)Lc&Tc@^eWv!Lb*syU$M z29(7X1r-)mb5P9aQHDS=gNEPIiB>FmX6Ypt^I?AVCV)AExb6bRfzlxi+FjUwP(V;15<<$#sa49H;yjXU1$HB> zYi%z^p{MO+(YM?6nACwirPN^iq2Kgdi5@T8_{k2COO^wC9+^Z=59fvhbaQf^c~Yt6 zDI6oud$v+Q>g3s&xey4&frB`Nlp3L5kIfmeR{P=n(>Nj54O&6sx`#O@Csx*|m2HuW zy~LVnBnw?K1!Ye43c3D5zQ0gVQ-Snx&LT1<^o9aj663o6c8+@k^I)xLd8m`E0HnMHeBxuCCTbCY#g2yg^6*DBOB z`Y%XCQrW;d)Ro^zf~ckK`8aA)c8RDWk=8nmMZ2zTa@;99y6H4(`fiw&v(Lb ztv7&>b%eQMEs9uJLX+WSiOyCcf@-EalS93yMUu$&&O6%)pvLL$^Snd!8NMPxI{R3p znm3}RS0vMm2uMU}w8J2Q;)JB)0*H*Zvy)BsA~8*pdYtHwf@IMMEK0F@yY08Uge!F+ zyj<40$y(bNaYkfRWVvJ(P&K4S)G#dEH&s3HMNx#3Hk>iE+iZU3CgpHmK}M6I)lfM% z41OK~NAIwTxlt&JhkkS0ti6C2tpmAoPlaT%kjW7^Ehi5ZDbEI&2bF4i37 zRcutwZA@+ql*e8s%!ZT-pQsdM)$w=3f(Z=dKE&j+MsBRV*M;c~)2L}Ydi z3$C#!?7I+0owh^6U9`DT*i`1mqAu$m_t!cf|~IT}UC*^@IV^) zqyiL+%13cf?SaEN56UX3m%XyUwg3iK6tEG%whC+%ux$d{4%iNX?F4L>z;**RCa^t# z?G@NQ!1fF5IlvAG?0LXm5ZFP$Y=OO~UZx*BE>7?v@nnXTcUUrTfR9K9KJQV)B8C)K)XpE}S|rAagZSLt?FqgJ&Y>44@+J*>`0akHLOp&L8mV;x@C zblG2nE9yPcj;w{rs%Exo*DEHu_rRjpwGG&{mTyygbAesA4le%%`?P?y*a%pfUlkpH z8eQNId-g|*e#@@gdNGQV{=Ps1x;Np|2k>6Z7{d5BUz?mft;rxWy_vko*RYck8@7N= z6Tm(+e#mb7aS(ZCW>i#Jq1Vscx86rbsi$LPGoR6~Vdm*Dw1H4)1G+aX~!^XEg%eX;q}v#*(5KCfdYi5ZHy~q z`*9q_rx@(9&)E{put9EEEgdCI=~gxQs)HmF`9L5u`=ixHEu&iF9d|ZzH49SN9M2b7#X^9_% zc$<+=+I}kqWqwLZBPPI?xjEC1I6dOmR8lWzoF(`^z@IHsk(vqXc`GRqZ0zpz92{F5 z8n7k^?X7yK`$h;Ty=EmwB@sLkYNCtHnniIZ5hWD(kd5i>nW97G=-5oCDn(gKOcLBm z&qE{5!=VydVd90PR4JCQ0SF=V*(+3|+@lQJ)AF05;1inDj~wF90o1#YsG*VEm{rJ? zUW1A&3N%M=J&?Lurh$g*W@Quco^C}YK=MfFg1s3sbW*C$cnSB`>>sKguJ&o?- zzttO{PB0sgNGVwB*I%^-qWGaRU;#{<3JDa9?M{h-0wN!#rthdocL^1Iss+}w02G%? 
zsuvbmtZUb3QXy?4&|Tr~r+d;uT)!b|$bj8&VMe5M5QIqU$Y9S)CbcqIxnCu?w_pt{ zOQn{Fl)j)KEe!+`k}?&(#vMT~%vQKTf;|tu4tf!dIjaTK4R$>o5h%_%OemzDhMik} zg)4MuDmaXV_hg5(m<4_%v>~Da3eIgJHdBY|Mk8Bf)>FppMa=tKd}$z-g5ZdC& z4XOafQ1}Uge};@fFsyl?G2WSB7K#Tb%M-;9kVj~Cj)BFQJC1}Efk=tW0SzWPh(7b+)csGP375g4T+&kQ0PzLr<8>{_X%~vP68<9azu?M478wr0xb{T3y#}+k% zCTvxdi-h+KWkZf4&JCw^h!ZeY4SB%8cFv+O5IUqD(tkwKm7ZN1G{M-^Dyra$!Je|u zO`f2q2Nwse#k=X*IZt3Ii3!y;mwF>S5piv3DDiX~)aXPok$V=OX3zsV<<2e!ihxZc zVu7t6SSun^yBBfnDWYy3#3Hf`C{k*CQvS6<6CJYw;zVjw3?A|OrWX8`zuJy3{FT!f zYSCO>S<(>!l}Cav5TADF8us&&5Y?lNjc~WwxW{8VQIzH5lz+mPa!&~%}>{{eiDs!b@iu2cu@XHp%TGCTzj<3T?Io#_+$;11M* zBfv%=2fZZX8~g&Q5ar3MA+QW@d%!ZdWD%C3M$vyj(z7x8n5N4a5*FzC@AmQW<93hI zACp($q8s-FJhvI_BJ|E=+@U+)Pek-0j3j-SSR9ec&!JYl4(!j|Y`o3(J-NIQ?V!cx zu#7W|KVNNo+QGhsh}w2sgpVpk?^fY4!&TD}o6_iQNi zWguZA3UwS-)R^2rBKJFnDf+1K7^}Y})HsE;j6uf#Z)&7k<0mq52f8d68Hw9A=s@U6 zSeOW3-?^}Xq#Q<&lrJ%0Zo(6S7@ScnWAgL@vJ`Sss-R*mKyQMZAw^MwjAS&hMO{*( zEX9O)#nE34llL=$hy#{(_>3C*3S6-(Z!=bar(C$X5mgQQmy6{C3@Gd^s+QM6`!2r8gno`&E?{*f>r%yGoZGGN$&H~=!8?oaM)5Oh2g*Oc+V9q3PF6|24 z4!H-F=nSua8;e2c#eS=gxoCt;VD||*lZ*eGR^b}u# zI^#@uP2fy2p><{6j<^9|$;rh6E@|ohUcXkmdgJP~+39!OcQ4m!(^oFLS7#(@_WjGZ zZeDX{`oPXD4n(_1uQl&Rp_p`S+&;xYvL@(mYP`pzP>9;{j>vffdFM?g(@YrEb1pKW z<16;NSYr;AphMZ?aOXXhE4j^a9%$o5`1JRX6!S%^1bs`r%ilcSnw9bXq>S94q}!3> zXG(+C2rrNK$hC2?R2m*Djg@wm_Lug{yDGuEag>5pBl$-$i@EX~@7^*}Hz)_EtR!rm z*2=5ec|4ib=HU3?r4fZ;`ZP;kemW=ElZck%FlLvD*mGM?1AlV(QzjUFVJjw0uhfUF QVT9NjS|84r2F9-c7hLv=%K!iX literal 0 HcmV?d00001 -- GitLab