diff --git a/PaddleCV/tracking/.gitmodules b/PaddleCV/tracking/.gitmodules new file mode 100644 index 0000000000000000000000000000000000000000..236705c5a2a142eff9cd15e25d6f6e2531783799 --- /dev/null +++ b/PaddleCV/tracking/.gitmodules @@ -0,0 +1,3 @@ +[submodule "pytracking/pysot-toolkit"] + path = pytracking/pysot-toolkit + url = https://github.com/StrangerZhang/pysot-toolkit.git diff --git a/PaddleCV/tracking/LICENSE b/PaddleCV/tracking/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..f288702d2fa16d3cdf0035b15a9fcbc552cd88e7 --- /dev/null +++ b/PaddleCV/tracking/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. 
If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. 
A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. 
+ + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. 
diff --git a/PaddleCV/tracking/README.md b/PaddleCV/tracking/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e27ecb11f465e0438dae7623c3d5f6c96696e037
--- /dev/null
+++ b/PaddleCV/tracking/README.md
@@ -0,0 +1,309 @@
+# tracking: a single object tracking framework
+
+## Introduction
+
+tracking is a visual object tracking (VOT) library built on Baidu's deep learning framework Paddle. Its overall design follows [pytracking](https://github.com/visionml/pytracking), whose clean architecture makes it easy to integrate trackers such as SiamFC, SiamRPN and SiamMask into a single framework for unified experiments and comparisons.
+
+tracking currently covers the mainstream tracking models, including SiamFC, SiamRPN, SiamMask and ATOM. It aims to provide developers with a set of convenient and efficient PaddlePaddle-based deep learning tracking algorithms, and the set of supported models will keep growing.
+
+An example of ATOM tracking:
+
+![ball](./imgs/ball1.gif)
+
+In the animation, the green box is the annotated ground-truth bbox and the red box is the bbox predicted by ATOM.
+
+## Code layout
+
+```
+imgs                  images of tracking results
+
+ltr                   model training code
+  └─ actors           take input data and produce the training objective
+  └─ admin            manages data paths, etc.
+  └─ data             multi-process data loading and preprocessing
+  └─ dataset          training dataset readers
+  └─ models           model definitions
+  └─ train_settings   training configurations
+  └─ trainers         model trainers
+  └─ run_training.py  training entry point
+
+pytracking            tracking code
+  └─ admin            manages data paths, model locations, etc.
+  └─ features         feature extraction
+  └─ libs             common tracking operations
+  └─ parameter        tracker parameter settings
+  └─ tracker          trackers
+  └─ utils            plotting, etc.
+  └─ pysot-toolkit    evaluation dataset loading and metric computation
+  └─ eval_benchmark.py                     evaluation entry point
+  └─ visualize_results_on_benchmark.ipynb  visualization of tracking results
+```
+
+## Getting started
+
+### Data preparation
+
+Tracking uses different datasets for training and testing, and the best current models are usually trained on several training sets at once.
+
+Commonly used training datasets:
+- [VID](http://bvisionweb1.cs.unc.edu/ilsvrc2015/ILSVRC2015_VID.tar.gz)
+- [Microsoft COCO 2014](http://cocodataset.org/#download)
+- [LaSOT](https://drive.google.com/file/d/1O2DLxPP8M4Pn4-XCttCJUW3A29tDIeNa/view)
+- [GOT-10K](http://got-10k.aitestunion.com/downloads_dataset/full_data)
+
+After downloading and extracting, organize the datasets as follows:
+```
+/Datasets/
+  └─ ILSVRC2015_VID/
+  └─ train2014/
+  └─ GOT-10K/
+  └─ LaSOTBenchmark/
+
+```
+`/Datasets/` is the directory where the datasets are stored.
+
+Note: the datasets are large, so make sure enough disk space is available. Training SiamFC only requires the VID dataset; training ATOM requires all of the datasets listed above.
+
+
+## Quick start
+
+Working environment for tracking:
+- python3
+- PaddlePaddle 1.7
+
+> Note: if you run into an error importing cmath, try switching the Python version; Python 3.6.8 or 3.7.0 is recommended.
+
+### Install dependencies
+
+1. Install Paddle 1.7. If your version is lower, please upgrade to Paddle 1.7.
+```bash
+pip install paddlepaddle-gpu==1.7.0
+```
+
+2. Install third-party libraries (anaconda is recommended):
+```bash
+# (optional) 0. Creating a new conda environment is strongly recommended; after installing anaconda run
+# conda create -n paddle1.7-py3.6 python=3.6
+# conda activate paddle1.7-py3.6
+
+cd tracking
+pip install -r requirements.txt
+
+# (optional) 1. recommended: fast jpeg decoding
+apt-get install libturbojpeg
+
+# (optional) 2. recommended: process control
+apt-get install build-essential libcap-dev
+pip install python-prctl
+```
+
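+If you want to confirm that the PaddlePaddle installation works before moving on, the snippet below is one way to do it. This is a minimal sketch, assuming the `install_check` utility shipped with Paddle 1.x is available in your build:
+
+```python
+import paddle
+import paddle.fluid as fluid
+
+# Runs a small built-in program to verify that the installation (GPU or CPU) works.
+fluid.install_check.run_check()
+print(paddle.__version__)  # expected to print 1.7.x
+```
+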
+
+### Pretrained backbone download
+
+Before training, prepare the pretrained backbone models for SiamRPN, SiamMask and ATOM.
+
+We provide the ResNet18 and ResNet50 backbones used by ATOM. A tarball with all pretrained models can be downloaded from [here](https://paddlemodels.bj.bcebos.com/paddle_track/vot/pretrained_models.tar).
+After extraction the folder is named `pretrained_models`, with the following layout:
+```
+/pretrained_models/
+  └─ atom
+    └─ atom_resnet18.pdparams
+    └─ atom_resnet50.pdparams
+  └─ backbone
+    └─ ResNet18.pdparams
+    └─ ResNet50.pdparams
+```
+The /pretrained_models/backbone/ folder contains the ResNet18 and ResNet50 models pretrained on ImageNet.
+
+
+### Set the training parameters
+
+Before launching training, set the dataset paths used by tracking and the directory where trained models are saved. These parameters are configured in ltr/admin/local.py.
+
+First, generate the local.py file.
+
+```bash
+# go to the repository root
+cd tracking
+
+```
+Then set the directory for saving trained models (workspace_dir), the backbone model path (backbone_dir), the dataset paths, and so on. Paths for datasets you do not use can be left unset.
+```
+# edit ltr/admin/local.py with your usual editor
+# for example, vim ltr/admin/local.py
+# where
+# workspace_dir = './checkpoints'     # where trained models are saved
+# backbone_dir = Your BACKBONE_PATH   # not needed when training SiamFC
+# and set the training datasets you need, e.g. VID, LaSOT, COCO:
+# imagenet_dir = '/Datasets/ILSVRC2015/'  # path of the VID training set
+
+# if ltr/admin/local.py does not exist, generate it with
+python -c "from ltr.admin.environment import create_default_local_file; create_default_local_file()"
+```
+
+Training SiamFC only requires workspace_dir and imagenet_dir, for example:
+```bash
+    self.workspace_dir = './checkpoints'
+    self.imagenet_dir = '/Datasets/ILSVRC2015/'
+```
+Training ATOM additionally requires the coco, lasot and got10k dataset paths, for example:
+```bash
+    self.workspace_dir = './checkpoints'
+    self.lasot_dir = '/Datasets/LaSOTBenchmark/'
+    self.coco_dir = '/Datasets/train2014/'
+    self.got10k_dir = '/Datasets/GOT-10k/train'
+    self.imagenet_dir = '/Datasets/ILSVRC2015/'
+```
+Training ATOM also requires the got10k and lasot dataset split files, prepared as follows:
+```bash
+cd ltr/data_specs/
+wget https://paddlemodels.cdn.bcebos.com/paddle_track/vot/got10k_lasot_split.tar
+tar xvf got10k_lasot_split.tar
+```
+
+
+### Launch training
+
+```bash
+# go to the training code directory
+cd ltr
+
+# train ATOM ResNet18
+python run_training.py bbreg atom_res18_vid_lasot_coco
+
+# train ATOM ResNet50
+python run_training.py bbreg atom_res50_vid_lasot_coco
+
+# train SiamFC
+python run_training.py siamfc siamfc_alexnet_vid
+```
+
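+Trained checkpoints are written under `workspace_dir`. If you want to sanity-check a checkpoint before running the full benchmark evaluation, the sketch below shows one way to load it in dygraph mode with Paddle's `load_dygraph` API; the checkpoint path used here is only an illustrative placeholder, so substitute the actual file produced by your run:
+
+```python
+import paddle.fluid as fluid
+
+with fluid.dygraph.guard():
+    # load_dygraph returns (parameter dict, optimizer state dict);
+    # the path is given without the .pdparams suffix.
+    params, _ = fluid.dygraph.load_dygraph(
+        './checkpoints/ltr/bbreg/atom_res18_vid_lasot_coco/checkpoint_ep0040')
+    print('loaded {} parameter tensors'.format(len(params)))
+```
+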
+
+## Model evaluation
+
+Trained models are evaluated with the [pysot-toolkit](https://github.com/StrangerZhang/pysot-toolkit) package, which provides evaluation APIs for several single object tracking benchmarks. The test datasets are best downloaded from the links provided by pysot-toolkit.
+
+Once the test data is ready, clone and build the pysot-toolkit evaluation module with the following commands:
+
+```bash
+cd pytracking
+git clone https://github.com/StrangerZhang/pysot-toolkit.git
+mv pysot-toolkit pysot_toolkit
+cd pysot_toolkit
+pip install -r requirements.txt
+cd pysot/utils/
+python setup.py build_ext --inplace
+```
+
+### Prepare the test dataset
+Prepare the VOT2018 dataset following the pysot-toolkit instructions and place it under /Datasets.
+
+### Set up the evaluation environment
+Next, configure the evaluation environment:
+```bash
+# set the test dataset, the model to evaluate and the directory for saving results in pytracking/admin/local.py
+# edit pytracking/admin/local.py with your usual editor
+# for example, vim pytracking/admin/local.py
+# settings.dataset_path and settings.network_path are the test dataset path and the trained model path, respectively
+
+# if pytracking/admin/local.py does not exist, generate it with
+python -c "from pytracking.admin.environment import create_default_local_file; create_default_local_file()"
+```
+
+### Prepare the test data and models
+Prepare the VOT2018 dataset following the pysot-toolkit instructions and place it in the folder given by settings.dataset_path, or point settings.dataset_path at your test dataset.
+
+
+Copy your trained models to `NETWORK_PATH`, or create a symbolic link, e.g.
+```bash
+ln -s tracking/ltr/Logs/checkpoints/ltr/bbreg/ $NETWORK_PATH/bbreg
+```
+
+### Run the evaluation
+
+Evaluate the ATOM model:
+```bash
+# evaluate ATOM on VOT2018
+# -d VOT2018                           evaluate on the VOT2018 dataset
+# -tr bbreg.atom_res18_vid_lasot_coco  the model to evaluate, consistent with the training name
+# -te atom.default_vot                 load the hyper-parameter file pytracking/parameter/atom/default_vot.py
+# -e 40                                evaluate the checkpoint from epoch 40; 'range(1, 50, 1)' evaluates the checkpoints from epoch 1 to epoch 49
+# -n 15                                run 15 times and average the results (default: 1)
+python eval_benchmark.py -d VOT2018 -tr bbreg.atom_res18_vid_lasot_coco -te atom.default_vot -e 40 -n 15
+```
+
+Evaluate SiamFC:
+```
+# evaluate SiamFC on VOT2018
+python eval_benchmark.py -d VOT2018 -tr siamfc.siamfc_alexnet_vid -te siamfc.default -e 'range(1, 50, 1)'
+```
+
+
+
+## Visualizing tracking results
+
+
+After evaluating on a benchmark, visualizing the tracker's results helps to locate problems. We provide the following way to visualize them:
+```bash
+cd pytracking
+
+# start jupyter notebook and note the token printed in the terminal
+jupyter notebook --ip 0.0.0.0 --port 8888
+```
+
+Open the server's IP address plus the port in your browser, or `http://localhost:8888` if running locally. If a token is required, check the terminal output of the `jupyter notebook --ip 0.0.0.0 --port 8888` command.
+
+Once the page is open, open `visualize_results_on_benchmark.ipynb` to visualize the results.
+
+## Benchmark results
+
+| Dataset | Model | Backbone | Paper result | Our result | Model |
+| :-------: | :-------: | :---: | :---: | :---------: |:---------: |
+|VOT2018| ATOM | Res18 | EAO: 0.401 | 0.399 | [model]() |
+|VOT2018| ATOM | AlexNet | EAO: 0.188 | 0.211 | [model]() |
+
+## Citations and references
+
+SiamFC **[[Paper]](https://arxiv.org/pdf/1811.07628.pdf) [[Code]](https://www.robots.ox.ac.uk/~luca/siamese-fc.html)**
+
+    @inproceedings{bertinetto2016fully,
+      title={Fully-convolutional siamese networks for object tracking},
+      author={Bertinetto, Luca and Valmadre, Jack and Henriques, Joao F and Vedaldi, Andrea and Torr, Philip HS},
+      booktitle={European conference on computer vision},
+      pages={850--865},
+      year={2016},
+      organization={Springer}
+    }
+
+ATOM **[[Paper]](https://arxiv.org/pdf/1811.07628.pdf) [[Raw results]](https://drive.google.com/drive/folders/1MdJtsgr34iJesAgL7Y_VelP8RvQm_IG_) [[Models]](https://drive.google.com/open?id=1EsNSQr25qfXHYLqjZaVZElbGdUg-nyzd) [[Training Code]](https://github.com/visionml/pytracking/blob/master/ltr/README.md#ATOM) [[Tracker Code]](https://github.com/visionml/pytracking/blob/master/pytracking/README.md#ATOM)**
+
+    @inproceedings{danelljan2019atom,
+      title={Atom: Accurate tracking by overlap maximization},
+      author={Danelljan, Martin and Bhat, Goutam and Khan, Fahad Shahbaz and Felsberg, Michael},
+      booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
+      pages={4660--4669},
+      year={2019}
+    }
+
+DiMP **[[Paper]](https://arxiv.org/pdf/1904.07220v1.pdf) [[Raw results]](https://drive.google.com/drive/folders/15mpUAJmzxemnOC6gmvMTCDJ-0v6hxJ7y) [[Models]](https://drive.google.com/open?id=1YEJySjhFokyQ6zgQg6vFAnzEFi1Onq7G) [[Training Code]](https://github.com/visionml/pytracking/blob/master/ltr/README.md#DiMP) [[Tracker Code]](https://github.com/visionml/pytracking/blob/master/pytracking/README.md#DiMP)**
+
+    @inproceedings{bhat2019learning,
+      title={Learning discriminative model prediction for tracking},
+      author={Bhat, Goutam and Danelljan, Martin and Gool, Luc Van and Timofte, Radu},
+      booktitle={Proceedings of the IEEE International Conference on Computer Vision},
+      pages={6182--6191},
+      year={2019}
+    }
+
+ECO **[[Paper]](https://arxiv.org/pdf/1611.09224.pdf) [[Models]](https://drive.google.com/open?id=1aWC4waLv_te-BULoy0k-n_zS-ONms21S) [[Tracker Code]](https://github.com/visionml/pytracking/blob/master/pytracking/README.md#ECO)**
+
+    @inproceedings{danelljan2017eco,
+      title={Eco: Efficient convolution operators for tracking},
+      author={Danelljan, Martin and Bhat, Goutam and Shahbaz Khan, Fahad and Felsberg, Michael},
+      booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+      pages={6638--6646},
+      year={2017}
+    }
diff --git a/PaddleCV/tracking/imgs/ball1.gif b/PaddleCV/tracking/imgs/ball1.gif
new file mode 100644
index 0000000000000000000000000000000000000000..e8fd8ca45a55ebe13e0d98adf39db3bc4eeac7d6
Binary files /dev/null and b/PaddleCV/tracking/imgs/ball1.gif differ
diff --git a/PaddleCV/tracking/ltr/actors/__init__.py b/PaddleCV/tracking/ltr/actors/__init__.py
new file mode 100644
index
0000000000000000000000000000000000000000..9b89e0f6077181d2d66cd2f546a2d9bf7aac7db4 --- /dev/null +++ b/PaddleCV/tracking/ltr/actors/__init__.py @@ -0,0 +1,3 @@ +from .base_actor import BaseActor +from .bbreg import AtomActor +from .siamfc import SiamFCActor diff --git a/PaddleCV/tracking/ltr/actors/base_actor.py b/PaddleCV/tracking/ltr/actors/base_actor.py new file mode 100644 index 0000000000000000000000000000000000000000..b4b456c97a1d1a0b267e058dd553c07d3b94367f --- /dev/null +++ b/PaddleCV/tracking/ltr/actors/base_actor.py @@ -0,0 +1,26 @@ +from pytracking.libs import TensorDict + + +class BaseActor: + """ Base class for actor. The actor class handles the passing of the data through the network + and calculation the loss""" + + def __init__(self, net, objective): + """ + args: + net - The network to train + objective - The loss function + """ + self.net = net + self.objective = objective + + def train(self): + """ Set whether the network is in train mode. + args: + mode (True) - Bool specifying whether in training mode. + """ + self.net.train() + + def eval(self): + """ Set network to eval mode""" + self.net.eval() diff --git a/PaddleCV/tracking/ltr/actors/bbreg.py b/PaddleCV/tracking/ltr/actors/bbreg.py new file mode 100644 index 0000000000000000000000000000000000000000..ed74bd7ee19ab45b3d6c982abe6e7348657f3b67 --- /dev/null +++ b/PaddleCV/tracking/ltr/actors/bbreg.py @@ -0,0 +1,38 @@ +from . import BaseActor +import paddle.fluid as fluid + + +class AtomActor(BaseActor): + """ Actor for training the IoU-Net in ATOM""" + + def __call__(self, data): + """ + args: + data - The input data, should contain the fields 'train_images', 'test_images', 'train_anno', + 'test_proposals' and 'proposal_iou'. + + returns: + loss - the training loss + states - dict containing detailed losses + """ + # Run network to obtain IoU prediction for each proposal in 'test_proposals' + iou_pred = self.net(data['train_images'], data['test_images'], + data['train_anno'], data['test_proposals']) + + iou_pred = fluid.layers.reshape(iou_pred, [-1, iou_pred.shape[2]]) + iou_gt = fluid.layers.reshape(data['proposal_iou'], + [-1, data['proposal_iou'].shape[2]]) + + # Compute loss + loss = self.objective(iou_pred, iou_gt) + loss = fluid.layers.mean(loss) + + # Use scale loss if exists + scale_loss = getattr(self.net, "scale_loss", None) + if callable(scale_loss): + loss = scale_loss(loss) + + # Return training stats + stats = {'Loss/total': loss.numpy(), 'Loss/iou': loss.numpy()} + + return loss, stats diff --git a/PaddleCV/tracking/ltr/actors/siamfc.py b/PaddleCV/tracking/ltr/actors/siamfc.py new file mode 100644 index 0000000000000000000000000000000000000000..d42970875a8bcb808ec23340307cbaef795332ca --- /dev/null +++ b/PaddleCV/tracking/ltr/actors/siamfc.py @@ -0,0 +1,46 @@ +import numpy as np +import paddle.fluid as fluid + +from . import BaseActor + + +class SiamFCActor(BaseActor): + """ Actor for training the IoU-Net in ATOM""" + + def __init__(self, net, objective, batch_size, shape, radius, stride): + super().__init__(net, objective) + self.label_mask, self.label_weights = self._creat_gt_mask( + batch_size, shape, radius, stride) + + def _creat_gt_mask(self, batch_size, shape, radius, stride): + h, w = shape + y = np.arange(h, dtype=np.float32) - (h - 1) / 2. + x = np.arange(w, dtype=np.float32) - (w - 1) / 2. 
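+        # Build a binary label map on the response grid: positions whose distance
+        # from the center is within radius / stride are positives, and the weights
+        # computed below are balanced so that positives and negatives each
+        # contribute half of the total loss.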
+ y, x = np.meshgrid(y, x) + dist = np.sqrt(x**2 + y**2) + mask = np.zeros((h, w)) + mask[dist <= radius / stride] = 1 + mask = mask[np.newaxis, :, :] + weights = np.ones_like(mask) + weights[mask == 1] = 0.5 / np.sum(mask == 1) + weights[mask == 0] = 0.5 / np.sum(mask == 0) + mask = np.repeat(mask, batch_size, axis=0)[:, np.newaxis, :, :] + weights = np.repeat(weights, batch_size, axis=0)[:, np.newaxis, :, :] + weights = fluid.dygraph.to_variable(weights.astype(np.float32)) + mask = fluid.dygraph.to_variable(mask.astype(np.float32)) + return mask, weights + + def __call__(self, data): + # Run network to obtain IoU prediction for each proposal in 'test_proposals' + target_estimations = self.net(data['train_images'], data['test_images']) + + # weighted loss + loss_mat = fluid.layers.sigmoid_cross_entropy_with_logits( + target_estimations, self.label_mask, normalize=False) + loss = fluid.layers.elementwise_mul(loss_mat, self.label_weights) + loss = fluid.layers.reduce_sum(loss) / loss.shape[0] + + # Return training stats + stats = {'Loss/total': loss.numpy(), 'Loss/center': loss.numpy()} + + return loss, stats diff --git a/PaddleCV/tracking/ltr/admin/__init__.py b/PaddleCV/tracking/ltr/admin/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/PaddleCV/tracking/ltr/admin/environment.py b/PaddleCV/tracking/ltr/admin/environment.py new file mode 100644 index 0000000000000000000000000000000000000000..590d56d878298c316eb38ac37be75a86a19c9d64 --- /dev/null +++ b/PaddleCV/tracking/ltr/admin/environment.py @@ -0,0 +1,53 @@ +import importlib +import os +from collections import OrderedDict + + +def create_default_local_file(): + path = os.path.join(os.path.dirname(__file__), 'local.py') + + empty_str = '\'\'' + default_settings = OrderedDict({ + 'workspace_dir': empty_str, + 'tensorboard_dir': 'self.workspace_dir + \'/tensorboard/\'', + 'backbone_dir': empty_str, + 'lasot_dir': empty_str, + 'got10k_dir': empty_str, + 'trackingnet_dir': empty_str, + 'coco_dir': empty_str, + 'imagenet_dir': empty_str, + 'imagenetdet_dir': empty_str + }) + + comment = { + 'workspace_dir': 'Base directory for saving network checkpoints.', + 'tensorboard_dir': 'Directory for tensorboard files.' + } + + with open(path, 'w') as f: + f.write('class EnvironmentSettings:\n') + f.write(' def __init__(self):\n') + + for attr, attr_val in default_settings.items(): + comment_str = None + if attr in comment: + comment_str = comment[attr] + if comment_str is None: + f.write(' self.{} = {}\n'.format(attr, attr_val)) + else: + f.write(' self.{} = {} # {}\n'.format(attr, attr_val, + comment_str)) + + +def env_settings(): + env_module_name = 'ltr.admin.local' + try: + env_module = importlib.import_module(env_module_name) + return env_module.EnvironmentSettings() + except: + env_file = os.path.join(os.path.dirname(__file__), 'local.py') + + create_default_local_file() + raise RuntimeError( + 'YOU HAVE NOT SETUP YOUR local.py!!!\n Go to "{}" and set all the paths you need. Then try to run again.'. + format(env_file)) diff --git a/PaddleCV/tracking/ltr/admin/local.py b/PaddleCV/tracking/ltr/admin/local.py new file mode 100644 index 0000000000000000000000000000000000000000..f598f81482ab3bb74365b2dfc3da04362825f4b6 --- /dev/null +++ b/PaddleCV/tracking/ltr/admin/local.py @@ -0,0 +1,11 @@ +class EnvironmentSettings: + def __init__(self): + self.workspace_dir = '' # Base directory for saving network checkpoints. 
+ self.tensorboard_dir = self.workspace_dir + '/tensorboard/' # Directory for tensorboard files. + self.backbone_dir = '' + self.lasot_dir = '' + self.got10k_dir = '' + self.trackingnet_dir = '' + self.coco_dir = '' + self.imagenet_dir = '' + self.imagenetdet_dir = '' diff --git a/PaddleCV/tracking/ltr/admin/model_constructor.py b/PaddleCV/tracking/ltr/admin/model_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..89ed126ab88b6c52edb6e9d0ccaac233eecff714 --- /dev/null +++ b/PaddleCV/tracking/ltr/admin/model_constructor.py @@ -0,0 +1,48 @@ +from functools import wraps +import importlib + + +def model_constructor(f): + """ Wraps the function 'f' which returns the network. An extra field 'constructor' is added to the network returned + by 'f'. This field contains an instance of the 'NetConstructor' class, which contains the information needed to + re-construct the network, such as the name of the function 'f', the function arguments etc. Thus, the network can + be easily constructed from a saved checkpoint by calling NetConstructor.get() function. + """ + + @wraps(f) + def f_wrapper(*args, **kwds): + net_constr = NetConstructor(f.__name__, f.__module__, args, kwds) + output = f(*args, **kwds) + if isinstance(output, (tuple, list)): + # Assume first argument is the network + output[0].constructor = net_constr + else: + output.constructor = net_constr + return output + + return f_wrapper + + +class NetConstructor: + """ Class to construct networks. Takes as input the function name (e.g. atom_resnet18), the name of the module + which contains the network function (e.g. ltr.models.bbreg.atom) and the arguments for the network + function. The class object can then be stored along with the network weights to re-construct the network.""" + + def __init__(self, fun_name, fun_module, args, kwds): + """ + args: + fun_name - The function which returns the network + fun_module - the module which contains the network function + args - arguments which are passed to the network function + kwds - arguments which are passed to the network function + """ + self.fun_name = fun_name + self.fun_module = fun_module + self.args = args + self.kwds = kwds + + def get(self): + """ Rebuild the network by calling the network function with the correct arguments. """ + net_module = importlib.import_module(self.fun_module) + net_fun = getattr(net_module, self.fun_name) + return net_fun(*self.args, **self.kwds) diff --git a/PaddleCV/tracking/ltr/admin/settings.py b/PaddleCV/tracking/ltr/admin/settings.py new file mode 100644 index 0000000000000000000000000000000000000000..2698b2427357590287092b539bc863447ffe391d --- /dev/null +++ b/PaddleCV/tracking/ltr/admin/settings.py @@ -0,0 +1,12 @@ +from ltr.admin.environment import env_settings + + +class Settings: + """ Training settings, e.g. 
the paths to datasets and networks.""" + + def __init__(self): + self.set_default() + + def set_default(self): + self.env = env_settings() + self.use_gpu = True diff --git a/PaddleCV/tracking/ltr/admin/stats.py b/PaddleCV/tracking/ltr/admin/stats.py new file mode 100644 index 0000000000000000000000000000000000000000..a4b46ed18955afc3b1d7a6c9f28400998a0a2f79 --- /dev/null +++ b/PaddleCV/tracking/ltr/admin/stats.py @@ -0,0 +1,70 @@ +class StatValue: + def __init__(self): + self.clear() + + def reset(self): + self.val = 0 + + def clear(self): + self.reset() + self.history = [] + + def update(self, val): + self.val = val + self.history.append(self.val) + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self): + self.clear() + self.has_new_data = False + + def reset(self): + self.avg = 0 + self.val = 0 + self.sum = 0 + self.count = 0 + + def clear(self): + self.reset() + self.history = [] + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def new_epoch(self): + if self.count > 0: + self.history.append(self.avg) + self.reset() + self.has_new_data = True + else: + self.has_new_data = False + + +def topk_accuracy(output, target, topk=(1, )): + """Computes the precision@k for the specified values of k""" + single_input = not isinstance(topk, (tuple, list)) + if single_input: + topk = (topk, ) + + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)[0] + res.append(correct_k * 100.0 / batch_size) + + if single_input: + return res[0] + + return res diff --git a/PaddleCV/tracking/ltr/admin/tensorboard.py b/PaddleCV/tracking/ltr/admin/tensorboard.py new file mode 100644 index 0000000000000000000000000000000000000000..c6d4de6965a67ee46359c596df264956f52a4173 --- /dev/null +++ b/PaddleCV/tracking/ltr/admin/tensorboard.py @@ -0,0 +1,29 @@ +import os +from collections import OrderedDict +from tensorboardX import SummaryWriter + + +class TensorboardWriter: + def __init__(self, directory, loader_names): + self.directory = directory + self.writer = OrderedDict({ + name: SummaryWriter(os.path.join(self.directory, name)) + for name in loader_names + }) + + def write_info(self, module_name, script_name, description): + tb_info_writer = SummaryWriter(os.path.join(self.directory, 'info')) + tb_info_writer.add_text('Modulet_name', module_name) + tb_info_writer.add_text('Script_name', script_name) + tb_info_writer.add_text('Description', description) + tb_info_writer.close() + + def write_epoch(self, stats: OrderedDict, epoch: int, ind=-1): + for loader_name, loader_stats in stats.items(): + if loader_stats is None: + continue + for var_name, val in loader_stats.items(): + if hasattr(val, 'history') and getattr(val, 'has_new_data', + True): + self.writer[loader_name].add_scalar(var_name, + val.history[ind], epoch) diff --git a/PaddleCV/tracking/ltr/data/__init__.py b/PaddleCV/tracking/ltr/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3ffe126ba0443207aa426d0f3e2b19272de6236d --- /dev/null +++ b/PaddleCV/tracking/ltr/data/__init__.py @@ -0,0 +1 @@ +from .loader import LTRLoader diff --git a/PaddleCV/tracking/ltr/data/image_loader.py b/PaddleCV/tracking/ltr/data/image_loader.py new file mode 100644 index 
0000000000000000000000000000000000000000..12bda4ee44ca382522119b692d70360a50bb8ba0 --- /dev/null +++ b/PaddleCV/tracking/ltr/data/image_loader.py @@ -0,0 +1,63 @@ +import jpeg4py +import cv2 as cv +import lmdb +import numpy as np + + +def default_image_loader(path): + """The default image loader, reads the image from the given path. It first tries to use the jpeg4py_loader, + but reverts to the opencv_loader if the former is not available.""" + if default_image_loader.use_jpeg4py is None: + # Try using jpeg4py + im = jpeg4py_loader(path) + if im is None: + default_image_loader.use_jpeg4py = False + print('Using opencv_loader instead.') + else: + default_image_loader.use_jpeg4py = True + return im + if default_image_loader.use_jpeg4py: + return jpeg4py_loader(path) + return opencv_loader(path) + + +default_image_loader.use_jpeg4py = None + + +def jpeg4py_loader(path): + """ Image reading using jpeg4py (https://github.com/ajkxyz/jpeg4py)""" + try: + return jpeg4py.JPEG(path).decode() + except Exception as e: + print('ERROR: Could not read image "{}"'.format(path)) + print(e) + return None + + +def opencv_loader(path): + """ Read image using opencv's imread function and returns it in rgb format""" + try: + im = cv.imread(path, cv.IMREAD_COLOR) + # convert to rgb and return + return cv.cvtColor(im, cv.COLOR_BGR2RGB) + except Exception as e: + print('ERROR: Could not read image "{}"'.format(path)) + print(e) + return None + + +def lmdb_loader(path, lmdb_path=None): + try: + if lmdb_loader.txn is None: + db = lmdb.open(lmdb_path, readonly=True, map_size=int(300e9)) + lmdb_loader.txn = db.begin(write=False) + img_buffer = lmdb_loader.txn.get(path.encode()) + img_buffer = np.frombuffer(img_buffer, np.uint8) + return cv.imdecode(img_buffer, cv.IMREAD_COLOR) + except Exception as e: + print('ERROR: Could not read image "{}"'.format(path)) + print(e) + return None + + +lmdb_loader.txn = None diff --git a/PaddleCV/tracking/ltr/data/loader.py b/PaddleCV/tracking/ltr/data/loader.py new file mode 100644 index 0000000000000000000000000000000000000000..0d7c8f11c95cb3621fe57f125efb8fce6e5434e8 --- /dev/null +++ b/PaddleCV/tracking/ltr/data/loader.py @@ -0,0 +1,115 @@ +import os +import signal +import sys + +import dataflow as df +import numpy as np + + +# handle terminate reader process, do not print stack frame +def _reader_quit(signum, frame): + print("Reader process exit.") + sys.exit() + + +def _term_group(sig_num, frame): + print('pid {} terminated, terminate group ' + '{}...'.format(os.getpid(), os.getpgrp())) + os.killpg(os.getpgid(os.getpid()), signal.SIGKILL) + + +signal.signal(signal.SIGTERM, _reader_quit) +signal.signal(signal.SIGINT, _term_group) + + +class LTRLoader(df.DataFlow): + """ + Data loader. Combines a dataset and a sampler, and provides + single- or multi-process iterators over the dataset. + + Note: an additional option stack_dim is available to + select along which dimension the data should be stacked to form a batch. + + Arguments: + dataset (Dataset): dataset from which to load the data. + batch_size (int, optional): how many samples per batch to load + (default: 1). + shuffle (bool, optional): set to ``True`` to have the data reshuffled + at every epoch (default: False). + sampler (Sampler, optional): defines the strategy to draw samples from + the dataset. If specified, ``shuffle`` must be False. + batch_sampler (Sampler, optional): like sampler, but returns a batch of + indices at a time. Mutually exclusive with batch_size, shuffle, + sampler, and drop_last. 
+ num_workers (int, optional): how many subprocesses to use for data + loading. 0 means that the data will be loaded in the main process. + (default: 0) + collate_fn (callable, optional): merges a list of samples to form a mini-batch. + stack_dim (int): Dimension along which to stack to form the batch. (default: 0) + pin_memory (bool, optional): If ``True``, the data loader will copy tensors + into CUDA pinned memory before returning them. + drop_last (bool, optional): set to ``True`` to drop the last incomplete batch, + if the dataset size is not divisible by the batch size. If ``False`` and + the size of dataset is not divisible by the batch size, then the last batch + will be smaller. (default: False) + timeout (numeric, optional): if positive, the timeout value for collecting a batch + from workers. Should always be non-negative. (default: 0) + worker_init_fn (callable, optional): If not None, this will be called on each + worker subprocess with the worker id (an int in ``[0, num_workers - 1]``) as + input, after seeding and before data loading. (default: None) + + + .. warning:: If ``spawn`` start method is used, :attr:`worker_init_fn` cannot be an + unpicklable object, e.g., a lambda function. + """ + + __initialized = False + + def __init__(self, + name, + dataset, + training=True, + batch_size=1, + shuffle=False, + sampler=None, + batch_sampler=None, + num_workers=0, + epoch_interval=1, + collate_fn=None, + stack_dim=0, + pin_memory=False, + drop_last=False, + timeout=0, + worker_init_fn=None): + + super().__init__() + + ds = df.RepeatedData(dataset, -1) + ds = df.MultiProcessRunnerZMQ(ds, num_proc=num_workers, hwm=300) + # ds = df.MultiThreadRunner(lambda: ds, num_prefetch=1024, num_thread=num_workers) + ds = df.BatchData(ds, batch_size) + self.ds = ds + + self.name = name + self.training = training + self.epoch_interval = epoch_interval + self.stack_dim = stack_dim + self.batches_per_epoch = len(dataset) // batch_size + + def __len__(self): + return self.batches_per_epoch + + def __iter__(self): + if not self.__initialized: + self.reset_state() + self.__initialized = True + + for d in self.ds: + if self.stack_dim > 0: + for k, v in d.items(): + if len(v.shape) >= self.stack_dim + 1: + d[k] = np.swapaxes(v, 0, self.stack_dim) + yield d + + def reset_state(self): + self.ds.reset_state() diff --git a/PaddleCV/tracking/ltr/data/processing.py b/PaddleCV/tracking/ltr/data/processing.py new file mode 100644 index 0000000000000000000000000000000000000000..ab207da0020d38ce47419c0053bab12a37bcf81b --- /dev/null +++ b/PaddleCV/tracking/ltr/data/processing.py @@ -0,0 +1,262 @@ +import numpy as np + +from ltr.data import transforms +import ltr.data.processing_utils as prutils +from pytracking.libs import TensorDict + + +class BaseProcessing: + """ Base class for Processing. Processing class is used to process the data returned by a dataset, before passing it + through the network. For example, it can be used to crop a search region around the object, apply various data + augmentations, etc.""" + + def __init__(self, + transform=transforms.ToArray(), + train_transform=None, + test_transform=None, + joint_transform=None): + """ + args: + transform - The set of transformations to be applied on the images. Used only if train_transform or + test_transform is None. + train_transform - The set of transformations to be applied on the train images. If None, the 'transform' + argument is used instead. + test_transform - The set of transformations to be applied on the test images. 
If None, the 'transform' + argument is used instead. + joint_transform - The set of transformations to be applied 'jointly' on the train and test images. For + example, it can be used to convert both test and train images to grayscale. + """ + self.transform = { + 'train': transform if train_transform is None else train_transform, + 'test': transform if test_transform is None else test_transform, + 'joint': joint_transform + } + + def __call__(self, data: TensorDict): + raise NotImplementedError + + +class SiamFCProcessing(BaseProcessing): + def __init__(self, + search_area_factor, + output_sz, + center_jitter_factor, + scale_jitter_factor, + mode='pair', + scale_type='context', + border_type='meanpad', + *args, + **kwargs): + super().__init__(*args, **kwargs) + self.search_area_factor = search_area_factor + self.output_sz = output_sz + self.center_jitter_factor = center_jitter_factor + self.scale_jitter_factor = scale_jitter_factor + self.mode = mode + self.scale_type = scale_type + self.border_type = border_type + + def _get_jittered_box(self, box, mode, rng): + jittered_size = box[2:4] * np.exp( + rng.randn(2) * self.scale_jitter_factor[mode]) + max_offset = (np.sqrt(jittered_size.prod()) * + self.center_jitter_factor[mode]) + jittered_center = box[0:2] + 0.5 * box[2:4] + max_offset * (rng.rand(2) + - 0.5) + + return np.concatenate( + (jittered_center - 0.5 * jittered_size, jittered_size), axis=0) + + def __call__(self, data: TensorDict, rng=None): + # Apply joint transforms + if self.transform['joint'] is not None: + num_train_images = len(data['train_images']) + all_images = data['train_images'] + data['test_images'] + all_images_trans = self.transform['joint'](*all_images) + + data['train_images'] = all_images_trans[:num_train_images] + data['test_images'] = all_images_trans[num_train_images:] + + for s in ['train', 'test']: + assert self.mode == 'sequence' or len(data[s + '_images']) == 1, \ + "In pair mode, num train/test frames must be 1" + + # Add a uniform noise to the center pos + jittered_anno = [ + self._get_jittered_box(a, s, rng) for a in data[s + '_anno'] + ] + + # Crop image region centered at jittered_anno box + try: + crops, boxes = prutils.jittered_center_crop( + data[s + '_images'], + jittered_anno, + data[s + '_anno'], + self.search_area_factor[s], + self.output_sz[s], + scale_type=self.scale_type, + border_type=self.border_type) + except Exception as e: + print('{}, anno: {}'.format(data['dataset'], data[s + '_anno'])) + raise e + + # Apply transforms + data[s + '_images'] = [self.transform[s](x) for x in crops] + data[s + '_anno'] = boxes + + # Prepare output + if self.mode == 'sequence': + data = data.apply(prutils.stack_tensors) + else: + data = data.apply(lambda x: x[0] if isinstance(x, list) else x) + + return data + + +class ATOMProcessing(BaseProcessing): + """ The processing class used for training ATOM. The images are processed in the following way. + First, the target bounding box is jittered by adding some noise. Next, a square region (called search region ) + centered at the jittered target center, and of area search_area_factor^2 times the area of the jittered box is + cropped from the image. The reason for jittering the target box is to avoid learning the bias that the target is + always at the center of the search region. The search region is then resized to a fixed size given by the + argument output_sz. A set of proposals are then generated for the test images by jittering the ground truth box. 
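A minimal configuration sketch of such a processing object (the numeric values, the ToArrayAndJitter transform and the name data_processing are illustrative assumptions, not values fixed by this code):

from ltr.data import transforms
from ltr.data.processing import ATOMProcessing

# Illustrative values only; the jitter factors and proposal_params are assumptions.
data_processing = ATOMProcessing(
    search_area_factor=5.0,
    output_sz=288,
    center_jitter_factor={'train': 0, 'test': 4.5},
    scale_jitter_factor={'train': 0, 'test': 0.5},
    proposal_params={'min_iou': 0.1,
                     'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3],
                     'boxes_per_frame': 16},
    mode='sequence',
    transform=transforms.ToArrayAndJitter(0.2))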
+ + """ + + def __init__(self, + search_area_factor, + output_sz, + center_jitter_factor, + scale_jitter_factor, + proposal_params, + mode='pair', + *args, + **kwargs): + """ + args: + search_area_factor - The size of the search region relative to the target size. + output_sz - An integer, denoting the size to which the search region is resized. The search region is always + square. + center_jitter_factor - A dict containing the amount of jittering to be applied to the target center before + extracting the search region. See _get_jittered_box for how the jittering is done. + scale_jitter_factor - A dict containing the amount of jittering to be applied to the target size before + extracting the search region. See _get_jittered_box for how the jittering is done. + proposal_params - Arguments for the proposal generation process. See _generate_proposals for details. + mode - Either 'pair' or 'sequence'. If mode='sequence', then output has an extra dimension for frames + """ + super().__init__(*args, **kwargs) + self.search_area_factor = search_area_factor + self.output_sz = output_sz + self.center_jitter_factor = center_jitter_factor + self.scale_jitter_factor = scale_jitter_factor + self.proposal_params = proposal_params + self.mode = mode + + def _get_jittered_box(self, box, mode, rng): + """ Jitter the input box + args: + box - input bounding box + mode - string 'train' or 'test' indicating train or test data + + returns: + Variable - jittered box + """ + + jittered_size = box[2:4] * np.exp( + rng.randn(2) * self.scale_jitter_factor[mode]) + max_offset = (np.sqrt(jittered_size.prod()) * + self.center_jitter_factor[mode]) + jittered_center = box[0:2] + 0.5 * box[2:4] + max_offset * (rng.rand(2) + - 0.5) + + return np.concatenate( + (jittered_center - 0.5 * jittered_size, jittered_size), axis=0) + + def _generate_proposals(self, box, rng): + """ Generates proposals by adding noise to the input box + args: + box - input box + + returns: + array - Array of shape (num_proposals, 4) containing proposals + array - Array of shape (num_proposals,) containing IoU overlap of each proposal with the input box. 
The + IoU is mapped to [-1, 1] + """ + # Generate proposals + num_proposals = self.proposal_params['boxes_per_frame'] + proposals = np.zeros((num_proposals, 4)) + gt_iou = np.zeros(num_proposals) + + for i in range(num_proposals): + proposals[i, :], gt_iou[i] = prutils.perturb_box( + box, + min_iou=self.proposal_params['min_iou'], + sigma_factor=self.proposal_params['sigma_factor'], + rng=rng) + + # Map to [-1, 1] + gt_iou = gt_iou * 2 - 1 + return proposals, gt_iou + + def __call__(self, data: TensorDict, rng=None): + """ + args: + data - The input data, should contain the following fields: + 'train_images' - + 'test_images' - + 'train_anno' - + 'test_anno' - + + returns: + TensorDict - output data block with following fields: + 'train_images' - + 'test_images' - + 'train_anno' - + 'test_anno' - + 'test_proposals'- + 'proposal_iou' - + """ + # Apply joint transforms + if self.transform['joint'] is not None: + num_train_images = len(data['train_images']) + all_images = data['train_images'] + data['test_images'] + all_images_trans = self.transform['joint'](*all_images) + + data['train_images'] = all_images_trans[:num_train_images] + data['test_images'] = all_images_trans[num_train_images:] + + for s in ['train', 'test']: + assert self.mode == 'sequence' or len(data[s + '_images']) == 1, \ + "In pair mode, num train/test frames must be 1" + + # Add a uniform noise to the center pos + jittered_anno = [ + self._get_jittered_box(a, s, rng) for a in data[s + '_anno'] + ] + + # Crop image region centered at jittered_anno box + try: + crops, boxes = prutils.jittered_center_crop( + data[s + '_images'], jittered_anno, data[s + '_anno'], + self.search_area_factor, self.output_sz) + except Exception as e: + print('{}, anno: {}'.format(data['dataset'], data[s + '_anno'])) + raise e + # Apply transforms + data[s + '_images'] = [self.transform[s](x) for x in crops] + data[s + '_anno'] = boxes + + # Generate proposals + frame2_proposals, gt_iou = zip( + * [self._generate_proposals(a, rng) for a in data['test_anno']]) + + data['test_proposals'] = list(frame2_proposals) + data['proposal_iou'] = list(gt_iou) + + # Prepare output + if self.mode == 'sequence': + data = data.apply(prutils.stack_tensors) + else: + data = data.apply(lambda x: x[0] if isinstance(x, list) else x) + + return data diff --git a/PaddleCV/tracking/ltr/data/processing_utils.py b/PaddleCV/tracking/ltr/data/processing_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4f38286ff823c0242f1c6024073fd8a557fb767f --- /dev/null +++ b/PaddleCV/tracking/ltr/data/processing_utils.py @@ -0,0 +1,288 @@ +import math +import numpy as np +import cv2 as cv + + +def stack_tensors(x): + if isinstance(x, list) and isinstance(x[0], np.ndarray): + return np.stack(x) + return x + + +def sample_target(im, + target_bb, + search_area_factor, + output_sz=None, + scale_type='original', + border_type='replicate'): + """ Extracts a square crop centered at target_bb box, of area search_area_factor^2 times target_bb area + + args: + im - cv image + target_bb - target box [x, y, w, h] + search_area_factor - Ratio of crop size to target size + output_sz - (float) Size to which the extracted crop is resized (always square). If None, no resizing is done. 
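A worked example of the crop-size arithmetic implemented below, assuming a 64 x 64 target box and search_area_factor = 2:

import math

w, h, search_area_factor = 64, 64, 2.0                             # assumed example box

# scale_type='original': the crop side is the target's geometric-mean side, scaled.
crop_original = math.ceil(math.sqrt(w * h) * search_area_factor)   # 128

# scale_type='context': the mean of width and height is added to each side before
# scaling, so a factor of 1 roughly matches the 127-pixel exemplar and 2 the 255 crop.
context = (w + h) / 2                                               # 64
base_size = math.sqrt((w + context) * (h + context))                # 128
crop_context = math.ceil(search_area_factor * base_size)            # 256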
+ + returns: + cv image - extracted crop + float - the factor by which the crop has been resized to make the crop size equal output_size + """ + + x, y, w, h = target_bb.tolist() + + # Crop image + if scale_type == 'original': + crop_sz = math.ceil(math.sqrt(w * h) * search_area_factor) + elif scale_type == 'context': + # some context is added into the target_size + # now, the search factor is respect to the "target + context" + # when search_factor = 1, output_size = 127 + # when search_factor = 2, output_size = 255 + context = (w + h) / 2 + base_size = math.sqrt( + (w + context) * (h + context)) # corresponds to 127 in crop + crop_sz = math.ceil(search_area_factor * base_size) + else: + raise NotImplementedError + + if crop_sz < 1: + raise Exception('Too small bounding box. w: {}, h: {}'.format(w, h)) + + x1 = round(x + 0.5 * w - crop_sz * 0.5) + x2 = x1 + crop_sz + + y1 = round(y + 0.5 * h - crop_sz * 0.5) + y2 = y1 + crop_sz + + x1_pad = max(0, -x1) + x2_pad = max(x2 - im.shape[1] + 1, 0) + + y1_pad = max(0, -y1) + y2_pad = max(y2 - im.shape[0] + 1, 0) + + # Crop target + im_crop = im[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad, :] + + # Pad + if border_type == 'replicate': + im_crop_padded = cv.copyMakeBorder(im_crop, y1_pad, y2_pad, x1_pad, + x2_pad, cv.BORDER_REPLICATE) + elif border_type == 'zeropad': + im_crop_padded = cv.copyMakeBorder(im_crop, y1_pad, y2_pad, x1_pad, + x2_pad, cv.BORDER_CONSTANT) + elif border_type == 'meanpad': + avg_chans = np.array( + [np.mean(im[:, :, 0]), np.mean(im[:, :, 1]), np.mean(im[:, :, 2])]) + im_crop_padded = cv.copyMakeBorder( + im_crop, + y1_pad, + y2_pad, + x1_pad, + x2_pad, + cv.BORDER_CONSTANT, + value=avg_chans) + else: + raise NotImplementedError + + if output_sz is not None: + resize_factor = output_sz / crop_sz + return cv.resize(im_crop_padded, (output_sz, output_sz)), resize_factor + else: + return im_crop_padded, 1.0 + + +def transform_image_to_crop(box_in: np.ndarray, + box_extract: np.ndarray, + resize_factor: float, + crop_sz: np.ndarray) -> np.ndarray: + """ Transform the box co-ordinates from the original image co-ordinates to the co-ordinates of the cropped image + args: + box_in - the box for which the co-ordinates are to be transformed + box_extract - the box about which the image crop has been extracted. 
+ resize_factor - the ratio between the original image scale and the scale of the image crop + crop_sz - size of the cropped image + + returns: + array - transformed co-ordinates of box_in + """ + box_extract_center = box_extract[0:2] + 0.5 * box_extract[2:4] + + box_in_center = box_in[0:2] + 0.5 * box_in[2:4] + + box_out_center = (crop_sz - 1) / 2 + (box_in_center - box_extract_center + ) * resize_factor + box_out_wh = box_in[2:4] * resize_factor + + box_out = np.concatenate((box_out_center - 0.5 * box_out_wh, box_out_wh)) + return box_out + + +def centered_crop(frames, anno, area_factor, output_sz): + crops_resize_factors = [ + sample_target(f, a, area_factor, output_sz) + for f, a in zip(frames, anno) + ] + + frames_crop, resize_factors = zip(*crops_resize_factors) + + crop_sz = np.array([output_sz, output_sz], 'int') + + # find the bb location in the crop + anno_crop = [ + transform_image_to_crop(a, a, rf, crop_sz) + for a, rf in zip(anno, resize_factors) + ] + + return frames_crop, anno_crop + + +def jittered_center_crop(frames, + box_extract, + box_gt, + search_area_factor, + output_sz, + scale_type='original', + border_type='replicate'): + """ For each frame in frames, extracts a square crop centered at box_extract, of area search_area_factor^2 + times box_extract area. The extracted crops are then resized to output_sz. Further, the co-ordinates of the box + box_gt are transformed to the image crop co-ordinates + + args: + frames - list of frames + box_extract - list of boxes of same length as frames. The crops are extracted using anno_extract + box_gt - list of boxes of same length as frames. The co-ordinates of these boxes are transformed from + image co-ordinates to the crop co-ordinates + search_area_factor - The area of the extracted crop is search_area_factor^2 times box_extract area + output_sz - The size to which the extracted crops are resized + + returns: + list - list of image crops + list - box_gt location in the crop co-ordinates + """ + crops_resize_factors = [ + sample_target( + f, + a, + search_area_factor, + output_sz, + scale_type=scale_type, + border_type=border_type) for f, a in zip(frames, box_extract) + ] + + frames_crop, resize_factors = zip(*crops_resize_factors) + + crop_sz = np.array([output_sz, output_sz], 'int') + + # find the bb location in the crop + box_crop = [ + transform_image_to_crop(a_gt, a_ex, rf, crop_sz) + for a_gt, a_ex, rf in zip(box_gt, box_extract, resize_factors) + ] + + return frames_crop, box_crop + + +def iou(reference, proposals): + """Compute the IoU between a reference box with multiple proposal boxes. + + args: + reference - Tensor of shape (1, 4). + proposals - Tensor of shape (num_proposals, 4) + + returns: + array - shape (num_proposals,) containing IoU of reference box with each proposal box. + """ + + # Intersection box + tl = np.maximum(reference[:, :2], proposals[:, :2]) + br = np.minimum(reference[:, :2] + reference[:, 2:], + proposals[:, :2] + proposals[:, 2:]) + sz = np.clip(br - tl, 0, np.inf) + + # Area + intersection = np.prod(sz, axis=1) + union = np.prod( + reference[:, 2:], axis=1) + np.prod( + proposals[:, 2:], axis=1) - intersection + + return intersection / union + + +def rand_uniform(a, b, rng=None, shape=1): + """ sample numbers uniformly between a and b. 
+ args: + a - lower bound + b - upper bound + shape - shape of the output tensor + + returns: + array + """ + rand = np.random.rand if rng is None else rng.rand + return (b - a) * rand(shape) + a + + +def perturb_box(box, min_iou=0.5, sigma_factor=0.1, rng=None): + """ Perturb the input box by adding gaussian noise to the co-ordinates + + args: + box - input box + min_iou - minimum IoU overlap between input box and the perturbed box + sigma_factor - amount of perturbation, relative to the box size. Can be either a single element, or a list of + sigma_factors, in which case one of them will be uniformly sampled. Further, each of the + sigma_factor element can be either a float, or a tensor + of shape (4,) specifying the sigma_factor per co-ordinate + + returns: + array - the perturbed box + """ + if rng is None: + rng = np.random + + if isinstance(sigma_factor, list): + # If list, sample one sigma_factor as current sigma factor + c_sigma_factor = rng.choice(sigma_factor) + else: + c_sigma_factor = sigma_factor + + if not isinstance(c_sigma_factor, np.ndarray): + c_sigma_factor = c_sigma_factor * np.ones(4) + + perturb_factor = np.sqrt(box[2] * box[3]) * c_sigma_factor + + # multiple tries to ensure that the perturbed box has iou > min_iou with the input box + for i_ in range(100): + c_x = box[0] + 0.5 * box[2] + c_y = box[1] + 0.5 * box[3] + c_x_per = rng.normal(c_x, perturb_factor[0]) + c_y_per = rng.normal(c_y, perturb_factor[1]) + + w_per = rng.normal(box[2], perturb_factor[2]) + h_per = rng.normal(box[3], perturb_factor[3]) + + if w_per <= 1: + w_per = box[2] * rand_uniform(0.15, 0.5, rng)[0] + + if h_per <= 1: + h_per = box[3] * rand_uniform(0.15, 0.5, rng)[0] + + box_per = np.round( + np.array( + [c_x_per - 0.5 * w_per, c_y_per - 0.5 * h_per, w_per, h_per])) + + if box_per[2] <= 1: + box_per[2] = box[2] * rand_uniform(0.15, 0.5, rng) + + if box_per[3] <= 1: + box_per[3] = box[3] * rand_uniform(0.15, 0.5, rng) + + box_iou = iou(np.reshape(box, (1, 4)), np.reshape(box_per, (1, 4))) + + # if there is sufficient overlap, return + if box_iou > min_iou: + return box_per, box_iou + + # else reduce the perturb factor + perturb_factor *= 0.9 + + return box_per, box_iou diff --git a/PaddleCV/tracking/ltr/data/sampler.py b/PaddleCV/tracking/ltr/data/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..064c604dfe14c71cdba21aea79d73af45d38f317 --- /dev/null +++ b/PaddleCV/tracking/ltr/data/sampler.py @@ -0,0 +1,180 @@ +import numpy as np +import dataflow as df +from pytracking.libs import TensorDict + + +def no_processing(data, rng=None): + return data + + +class ATOMSampler(df.RNGDataFlow): + """ Class responsible for sampling frames from training sequences to form batches. Each training sample is a + tuple consisting of i) a train frame, used to obtain the modulation vector, and ii) a set of test frames on which + the IoU prediction loss is calculated. + + The sampling is done in the following ways. First a dataset is selected at random. Next, a sequence is selected + from that dataset. A 'train frame' is then sampled randomly from the sequence. Next, depending on the + frame_sample_mode, the required number of test frames are sampled randomly, either from the range + [train_frame_id - max_gap, train_frame_id + max_gap] in the 'default' mode, or from [train_frame_id, train_frame_id + max_gap] + in the 'causal' mode. Only the frames in which the target is visible are sampled, and if enough visible frames are + not found, the 'max_gap' is incremented. 
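A sketch of how a sampler of this kind is typically wired to datasets, a processing object and the LTRLoader defined earlier (the dataset choice, batch size and epoch length below are illustrative assumptions):

from ltr.data import LTRLoader
from ltr.data.sampler import ATOMSampler
from ltr.dataset import Lasot, Got10k

lasot_train = Lasot(split='train')     # dataset roots come from ltr/admin/local.py
got10k_train = Got10k(split='train')

dataset_train = ATOMSampler(
    [lasot_train, got10k_train], p_datasets=[1, 1],
    samples_per_epoch=1000 * 64, max_gap=50,
    processing=data_processing)        # e.g. the ATOMProcessing sketch shown earlier

loader_train = LTRLoader(
    'train', dataset_train, training=True,
    batch_size=64, num_workers=4, stack_dim=1)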
+ + The sampled frames are then passed through the input 'processing' function for the necessary processing- + """ + + def __init__(self, + datasets, + p_datasets, + samples_per_epoch, + max_gap, + num_test_frames=1, + processing=no_processing, + frame_sample_mode='default'): + """ + args: + datasets - List of datasets to be used for training + p_datasets - List containing the probabilities by which each dataset will be sampled + samples_per_epoch - Number of training samples per epoch + max_gap - Maximum gap, in frame numbers, between the train (reference) frame and the test frames. + num_test_frames - Number of test frames used for calculating the IoU prediction loss. + processing - An instance of Processing class which performs the necessary processing of the data. + frame_sample_mode - Either 'default' or 'causal'. If 'causal', then the test frames are sampled in a causal + manner. + """ + self.datasets = datasets + + # If p not provided, sample uniformly from all videos + if p_datasets is None: + p_datasets = [1 for d in self.datasets] + + # Normalize + p_total = sum(p_datasets) + self.p_datasets = [x / p_total for x in p_datasets] + + self.samples_per_epoch = samples_per_epoch + self.max_gap = max_gap + self.num_test_frames = num_test_frames + self.num_train_frames = 1 # Only a single train frame allowed + self.processing = processing + self.frame_sample_mode = frame_sample_mode + + def __len__(self): + return self.samples_per_epoch + + def _sample_visible_ids(self, visible, num_ids=1, min_id=None, max_id=None): + """ Samples num_ids frames between min_id and max_id for which target is visible + + args: + visible - 1d Tensor indicating whether target is visible for each frame + num_ids - number of frames to be samples + min_id - Minimum allowed frame number + max_id - Maximum allowed frame number + + returns: + list - List of sampled frame numbers. None if not sufficient visible frames could be found. 
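A small illustration of this contract, using assumed toy visibility flags:

visible = [True, True, False, True]                   # assumed toy input
valid_ids = [i for i, v in enumerate(visible) if v]
# valid_ids == [0, 1, 3]; num_ids indices are then drawn from it with replacement.
# If no frame in [min_id, max_id) is visible, None is returned and the caller
# widens max_gap before trying again.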
+ """ + if min_id is None or min_id < 0: + min_id = 0 + if max_id is None or max_id > len(visible): + max_id = len(visible) + + valid_ids = [i for i in range(min_id, max_id) if visible[i]] + + # No visible ids + if len(valid_ids) == 0: + return None + + inds = self.rng.choice( + range(len(valid_ids)), size=num_ids, replace=True) + ids = [valid_ids[ii] for ii in inds] + # return random.choices(valid_ids, k=num_ids) + return ids + + def __iter__(self): + """ + args: + index (int): Index (Ignored since we sample randomly) + + returns: + TensorDict - dict containing all the data blocks + """ + + # Select a dataset + # dataset = self.rng.choices(self.datasets, self.p_datasets)[0] + dataset_idx = self.rng.choice( + range(len(self.datasets)), p=self.p_datasets, replace=False) + dataset = self.datasets[dataset_idx] + is_video_dataset = dataset.is_video_sequence() + + min_visible_frames = 2 * (self.num_test_frames + self.num_train_frames) + enough_visible_frames = False + + # Sample a sequence with enough visible frames and get anno for the same + while not enough_visible_frames: + seq_id = self.rng.randint(0, dataset.get_num_sequences() - 1) + anno, visible = dataset.get_sequence_info(seq_id) + num_visible = np.sum(visible.astype('int64')) + enough_visible_frames = not is_video_dataset or ( + num_visible > min_visible_frames and len(visible) >= 20) + + if is_video_dataset: + train_frame_ids = None + test_frame_ids = None + gap_increase = 0 + if self.frame_sample_mode == 'default': + # Sample frame numbers + while test_frame_ids is None: + train_frame_ids = self._sample_visible_ids( + visible, num_ids=self.num_train_frames) + test_frame_ids = self._sample_visible_ids( + visible, + min_id=train_frame_ids[0] - self.max_gap - gap_increase, + max_id=train_frame_ids[0] + self.max_gap + gap_increase, + num_ids=self.num_test_frames) + gap_increase += 5 # Increase gap until a frame is found + elif self.frame_sample_mode == 'causal': + # Sample frame numbers in a causal manner, i.e. 
test_frame_ids > train_frame_ids + while test_frame_ids is None: + base_frame_id = self._sample_visible_ids( + visible, + num_ids=1, + min_id=self.num_train_frames - 1, + max_id=len(visible) - self.num_test_frames) + prev_frame_ids = self._sample_visible_ids( + visible, + num_ids=self.num_train_frames - 1, + min_id=base_frame_id[0] - self.max_gap - gap_increase, + max_id=base_frame_id[0]) + if prev_frame_ids is None: + gap_increase += 5 + continue + train_frame_ids = base_frame_id + prev_frame_ids + test_frame_ids = self._sample_visible_ids( + visible, + min_id=train_frame_ids[0] + 1, + max_id=train_frame_ids[0] + self.max_gap + gap_increase, + num_ids=self.num_test_frames) + gap_increase += 5 # Increase gap until a frame is found + else: + raise ValueError('Unknown frame_sample_mode.') + else: + train_frame_ids = [1] * self.num_train_frames + test_frame_ids = [1] * self.num_test_frames + + # Get frames + train_frames, train_anno, _ = dataset.get_frames(seq_id, + train_frame_ids, anno) + test_frames, test_anno, _ = dataset.get_frames(seq_id, test_frame_ids, + anno) + + # Prepare data + data = TensorDict({ + 'train_images': train_frames, + 'train_anno': train_anno, + 'test_images': test_frames, + 'test_anno': test_anno, + 'dataset': dataset.get_name() + }) + + # Send for processing + yield self.processing(data, rng=self.rng) diff --git a/PaddleCV/tracking/ltr/data/transforms.py b/PaddleCV/tracking/ltr/data/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..83c6e3611119a757612377bea6e252049f19f9fe --- /dev/null +++ b/PaddleCV/tracking/ltr/data/transforms.py @@ -0,0 +1,148 @@ +import random +import numpy as np +import math +import cv2 as cv +from paddle.fluid import layers +from pytracking.libs.paddle_utils import PTensor + + +class Transform: + """ Class for applying various image transformations.""" + + def __call__(self, *args): + rand_params = self.roll() + if rand_params is None: + rand_params = () + elif not isinstance(rand_params, tuple): + rand_params = (rand_params, ) + output = [self.transform(img, *rand_params) for img in args] + if len(output) == 1: + return output[0] + return output + + def roll(self): + return None + + def transform(self, img, *args): + """Must be deterministic""" + raise NotImplementedError + + +class Compose: + """Composes several transforms together. + + Args: + transforms (list of ``Transform`` objects): list of transforms to compose. + """ + + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, *args): + for t in self.transforms: + if not isinstance(args, tuple): + args = (args, ) + args = t(*args) + return args + + def __repr__(self): + format_string = self.__class__.__name__ + '(' + for t in self.transforms: + format_string += '\n' + format_string += ' {0}'.format(t) + format_string += '\n)' + return format_string + + +class Normalize(object): + """Normalize an tensor image with mean and standard deviation. + Given mean: ``(M1,...,Mn)`` and std: ``(S1,..,Sn)`` for ``n`` channels, this transform + will normalize each channel of the input i.e. + ``input[channel] = (input[channel] - mean[channel]) / std[channel]`` + + Args: + mean (sequence): Sequence of means for each channel. + std (sequence): Sequence of standard deviations for each channel. + """ + + def __init__(self, mean, std): + self.mean = np.reshape(mean, [-1, 1, 1]) + self.std = np.reshape(std, [-1, 1, 1]) + + def __call__(self, tensor): + """ + Args: + tensor (Tensor): Tensor image of size (C, H, W) to be normalized. 
+ + Returns: + Tensor: Normalized Tensor image. + """ + return (tensor - self.mean) / self.std + + +class ToArray(Transform): + """ Transpose image and jitter brightness""" + + def __init__(self, brightness_jitter=0.0): + self.brightness_jitter = brightness_jitter + + def __call__(self, img): + img = img.transpose((2, 0, 1)) + return img.astype('float32') / 255. + + +class ToArrayAndJitter(Transform): + """ Transpose image and jitter brightness""" + + def __init__(self, brightness_jitter=0.0): + self.brightness_jitter = brightness_jitter + + def roll(self): + return np.random.uniform( + max(0, 1 - self.brightness_jitter), 1 + self.brightness_jitter) + + def transform(self, img, brightness_factor): + # handle numpy array + img = img.transpose((2, 0, 1)) + + # backward compatibility + return np.clip( + img.astype('float32') * brightness_factor / 255.0, 0.0, 1.0) + + +class ToGrayscale(Transform): + """Converts image to grayscale with probability""" + + def __init__(self, probability=0.5): + self.probability = probability + self.color_weights = np.array( + [0.2989, 0.5870, 0.1140], dtype=np.float32) + + def roll(self): + return random.random() < self.probability + + def transform(self, img, do_grayscale): + if do_grayscale: + if isinstance(img, PTensor): + raise NotImplementedError('Implement paddle variant.') + img_gray = cv.cvtColor(img, cv.COLOR_RGB2GRAY) + return np.stack([img_gray, img_gray, img_gray], axis=2) + # return np.repeat(np.sum(img * self.color_weights, axis=2, keepdims=True).astype(np.uint8), 3, axis=2) + return img + + +class RandomHorizontalFlip(Transform): + """Horizontally flip the given NumPy Image randomly with a probability p.""" + + def __init__(self, probability=0.5): + self.probability = probability + + def roll(self): + return random.random() < self.probability + + def transform(self, img, do_flip): + if do_flip: + if isinstance(img, PTensor): + return layers.reverse(img, 2) + return np.fliplr(img).copy() + return img diff --git a/PaddleCV/tracking/ltr/dataset/__init__.py b/PaddleCV/tracking/ltr/dataset/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..330cd163c602b0eecbbf67b0758959bc179a7442 --- /dev/null +++ b/PaddleCV/tracking/ltr/dataset/__init__.py @@ -0,0 +1,8 @@ +from .lasot import Lasot +from .got10k import Got10k +from .tracking_net import TrackingNet +from .imagenetvid import ImagenetVID +from .coco_seq import MSCOCOSeq +from .vot import VOT +from .youtube_vos import VOS +from .youtube_bb import YoutubeBB diff --git a/PaddleCV/tracking/ltr/dataset/base_dataset.py b/PaddleCV/tracking/ltr/dataset/base_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..34129284ae19b431548b91c9756099c689d2aead --- /dev/null +++ b/PaddleCV/tracking/ltr/dataset/base_dataset.py @@ -0,0 +1,85 @@ +from ltr.data.image_loader import default_image_loader + + +class BaseDataset(object): + """ Base class for datasets """ + + def __init__(self, root, image_loader=default_image_loader): + """ + args: + root - The root path to the dataset + image_loader (jpeg4py_loader) - The function to read the images. jpeg4py (https://github.com/ajkxyz/jpeg4py) + is used by default. + """ + if root == '': + raise Exception( + 'The dataset path is not setup. Check your "ltr/admin/local.py".' + ) + self.root = root + self.image_loader = image_loader + + self.sequence_list = [] # Contains the list of sequences. 
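As an aside, the transform classes defined above in ltr/data/transforms.py are typically chained with Compose; the grayscale probability, jitter amount and normalization statistics below are illustrative assumptions:

from ltr.data.transforms import Compose, Normalize, ToArrayAndJitter, ToGrayscale

# Assumed example pipeline: occasional grayscale, brightness jitter,
# HWC uint8 -> CHW float in [0, 1], then channel-wise normalization.
transform_train = Compose([
    ToGrayscale(probability=0.05),
    ToArrayAndJitter(brightness_jitter=0.2),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])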
+ + def __len__(self): + """ Returns size of the dataset + returns: + int - number of samples in the dataset + """ + return self.get_num_sequences() + + def __getitem__(self, index): + """ Not to be used! Check get_frames() instead. + """ + return None + + def is_video_sequence(self): + """ Returns whether the dataset is a video dataset or an image dataset + + returns: + bool - True if a video dataset + """ + return True + + def get_name(self): + """ Name of the dataset + + returns: + string - Name of the dataset + """ + raise NotImplementedError + + def get_num_sequences(self): + """ Number of sequences in a dataset + + returns: + int - number of sequences in the dataset.""" + return len(self.sequence_list) + + def get_sequence_info(self, seq_id): + """ Returns information about a particular sequences, + + args: + seq_id - index of the sequence + + returns: + Tensor - Annotation for the sequence. A 2d tensor of shape (num_frames, 4). + Format [top_left_x, top_left_y, width, height] + Tensor - 1d Tensor specifying whether target is present (=1 )for each frame. shape (num_frames,) + """ + raise NotImplementedError + + def get_frames(self, seq_id, frame_ids, anno=None): + """ Get a set of frames from a particular sequence + + args: + seq_id - index of sequence + frame_ids - a list of frame numbers + anno(None) - The annotation for the sequence (see get_sequence_info). If None, they will be loaded. + + returns: + list - List of frames corresponding to frame_ids + list - List of annotations (tensor of shape (4,)) for each frame + dict - A dict containing meta information about the sequence, e.g. class of the target object. + + """ + raise NotImplementedError diff --git a/PaddleCV/tracking/ltr/dataset/coco_seq.py b/PaddleCV/tracking/ltr/dataset/coco_seq.py new file mode 100644 index 0000000000000000000000000000000000000000..d55442944dfda2417c50c10fc35c0237b8cc22c8 --- /dev/null +++ b/PaddleCV/tracking/ltr/dataset/coco_seq.py @@ -0,0 +1,130 @@ +import os +from .base_dataset import BaseDataset +from ltr.data.image_loader import default_image_loader +from pycocotools.coco import COCO +from collections import OrderedDict +from ltr.admin.environment import env_settings +import numpy as np + + +class MSCOCOSeq(BaseDataset): + """ The COCO dataset. COCO is an image dataset. Thus, we treat each image as a sequence of length 1. + + Publication: + Microsoft COCO: Common Objects in Context. + Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B. Girshick, James Hays, Pietro Perona, + Deva Ramanan, Piotr Dollar and C. Lawrence Zitnick + ECCV, 2014 + https://arxiv.org/pdf/1405.0312.pdf + + Download the images along with annotations from http://cocodataset.org/#download. The root folder should be + organized as follows. + - coco_root + - annotations + - instances_train2014.json + - images + - train2014 + + Note: You also have to install the coco pythonAPI from https://github.com/cocodataset/cocoapi. + """ + + def __init__(self, + root=None, + filter=None, + image_loader=default_image_loader): + root = env_settings().coco_dir if root is None else root + super().__init__(root, image_loader) + self.filter = filter + + # self.img_pth = os.path.join(root, 'train2014/') + self.img_pth = os.path.join(root, 'train2017/') + # self.anno_path = os.path.join(root, 'annotations/instances_train2014.json') + self.anno_path = os.path.join(root, + 'annotations/instances_train2017.json') + + # Load the COCO set. 
+ self.coco_set = COCO(self.anno_path) + + self.cats = self.coco_set.cats + self.sequence_list = self._get_sequence_list() + + def _get_sequence_list(self): + ann_list = list(self.coco_set.anns.keys()) + seq_list = [] + print('COCO before: {}'.format(len(ann_list))) + for a in ann_list: + if self.coco_set.anns[a]['iscrowd'] == 0: + box = self.coco_set.anns[a]['bbox'] + box = np.reshape(np.array(box), (1, 4)) + target_visible = (box[:, 2] > 0) & (box[:, 3] > 0) + if self.filter: + target_large = (box[:, 2] * box[:, 3] > 30 * 30) + ratio = box[:, 2] / box[:, 3] + target_reasonable_ratio = (10 > ratio) & (ratio > 0.1) + target_visible = target_visible & target_large & target_reasonable_ratio + if target_visible: + seq_list.append(a) + print('COCO after: {}'.format(len(seq_list))) + return seq_list + + def is_video_sequence(self): + return False + + def get_name(self): + return 'coco' + + def get_num_sequences(self): + return len(self.sequence_list) + + def get_sequence_info(self, seq_id): + anno = self._get_anno(seq_id) + target_visible = (anno[:, 2] > 0) & (anno[:, 3] > 0) + return anno, target_visible + + def _get_anno(self, seq_id): + anno = self.coco_set.anns[self.sequence_list[seq_id]]['bbox'] + return np.reshape(np.array(anno), (1, 4)) + + def _get_frames(self, seq_id): + path = self.coco_set.loadImgs( + [self.coco_set.anns[self.sequence_list[seq_id]]['image_id']])[0][ + 'file_name'] + img = self.image_loader(os.path.join(self.img_pth, path)) + return img + + def get_meta_info(self, seq_id): + try: + cat_dict_current = self.cats[self.coco_set.anns[self.sequence_list[ + seq_id]]['category_id']] + object_meta = OrderedDict({ + 'object_class': cat_dict_current['name'], + 'motion_class': None, + 'major_class': cat_dict_current['supercategory'], + 'root_class': None, + 'motion_adverb': None + }) + except: + object_meta = OrderedDict({ + 'object_class': None, + 'motion_class': None, + 'major_class': None, + 'root_class': None, + 'motion_adverb': None + }) + return object_meta + + def get_frames(self, seq_id=None, frame_ids=None, anno=None): + # COCO is an image dataset. Thus we replicate the image denoted by seq_id len(frame_ids) times, and return a + # list containing these replicated images. + frame = self._get_frames(seq_id) + + frame_list = [frame.copy() for _ in frame_ids] + + if anno is None: + anno = self._get_anno(seq_id) + + anno_frames = [anno.copy()[0, :] for _ in frame_ids] + + object_meta = self.get_meta_info(seq_id) + + return frame_list, anno_frames, object_meta diff --git a/PaddleCV/tracking/ltr/dataset/got10k.py b/PaddleCV/tracking/ltr/dataset/got10k.py new file mode 100644 index 0000000000000000000000000000000000000000..986e2cf2f8098484eae1afaa2ea2574d41b9bc98 --- /dev/null +++ b/PaddleCV/tracking/ltr/dataset/got10k.py @@ -0,0 +1,183 @@ +import os +import os.path +import numpy as np +import csv +import pandas +from collections import OrderedDict +from .base_dataset import BaseDataset +from ltr.data.image_loader import default_image_loader +from ltr.admin.environment import env_settings + + +class Got10k(BaseDataset): + """ GOT-10k dataset. 
+ + Publication: + GOT-10k: A Large High-Diversity Benchmark for Generic Object Tracking in the Wild + Lianghua Huang, Xin Zhao, and Kaiqi Huang + arXiv:1810.11981, 2018 + https://arxiv.org/pdf/1810.11981.pdf + + Download dataset from http://got-10k.aitestunion.com/downloads + """ + + def __init__(self, + root=None, + filter=None, + image_loader=default_image_loader, + split=None, + seq_ids=None): + """ + args: + root - path to the got-10k training data. Note: This should point to the 'train' folder inside GOT-10k + image_loader (jpeg4py_loader) - The function to read the images. jpeg4py (https://github.com/ajkxyz/jpeg4py) + is used by default. + split - 'train' or 'val'. Note: The validation split here is a subset of the official got-10k train split, + not NOT the official got-10k validation split. To use the official validation split, provide that as + the root folder instead. + seq_ids - List containing the ids of the videos to be used for training. Note: Only one of 'split' or 'seq_ids' + options can be used at the same time. + """ + root = env_settings().got10k_dir if root is None else root + super().__init__(root, image_loader) + + # all folders inside the root + self.sequence_list = self._get_sequence_list() + + if split == 'vot-train': + ltr_path = os.path.join( + os.path.dirname(os.path.realpath(__file__)), '..') + with open( + os.path.join(ltr_path, 'data_specs', + 'got10k_prohibited_for_VOT.txt')) as f: + prohibited = [l.strip() for l in f.readlines()] + print('GOT10K before: {}'.format(len(self.sequence_list))) + self.sequence_list = [ + x for x in self.sequence_list if x not in prohibited + ] + print('GOT10K after: {}'.format(len(self.sequence_list))) + else: + # seq_id is the index of the folder inside the got10k root path + if split is not None: + if seq_ids is not None: + raise ValueError('Cannot set both split_name and seq_ids.') + ltr_path = os.path.join( + os.path.dirname(os.path.realpath(__file__)), '..') + if split == 'train': + file_path = os.path.join(ltr_path, 'data_specs', + 'got10k_train_split.txt') + elif split == 'val': + file_path = os.path.join(ltr_path, 'data_specs', + 'got10k_val_split.txt') + else: + raise ValueError('Unknown split name.') + seq_ids = pandas.read_csv( + file_path, header=None, squeeze=True, + dtype=np.int64).values.tolist() + elif seq_ids is None: + seq_ids = list(range(0, len(self.sequence_list))) + # self.seq_ids = seq_ids + + self.sequence_list = [self.sequence_list[i] for i in seq_ids] + + self.sequence_meta_info = self._load_meta_info() + self.filter = filter + + def get_name(self): + return 'got10k' + + def _load_meta_info(self): + sequence_meta_info = { + s: self._read_meta(os.path.join(self.root, s)) + for s in self.sequence_list + } + return sequence_meta_info + + def _read_meta(self, seq_path): + try: + with open(os.path.join(seq_path, 'meta_info.ini')) as f: + meta_info = f.readlines() + object_meta = OrderedDict({ + 'object_class': meta_info[5].split(': ')[-1][:-1], + 'motion_class': meta_info[6].split(': ')[-1][:-1], + 'major_class': meta_info[7].split(': ')[-1][:-1], + 'root_class': meta_info[8].split(': ')[-1][:-1], + 'motion_adverb': meta_info[9].split(': ')[-1][:-1] + }) + except: + object_meta = OrderedDict({ + 'object_class': None, + 'motion_class': None, + 'major_class': None, + 'root_class': None, + 'motion_adverb': None + }) + return object_meta + + def _get_sequence_list(self): + with open(os.path.join(self.root, 'list.txt')) as f: + # dir_names = f.readlines() + dir_list = list(csv.reader(f)) + dir_list = 
[dir_name[0] for dir_name in dir_list] + return dir_list + + def _read_anno(self, seq_path): + anno_file = os.path.join(seq_path, "groundtruth.txt") + gt = pandas.read_csv( + anno_file, + delimiter=',', + header=None, + dtype=np.float32, + na_filter=False, + low_memory=False).values + return np.array(gt) + + def _read_target_visible(self, seq_path, anno): + # Read full occlusion and out_of_view + occlusion_file = os.path.join(seq_path, "absence.label") + cover_file = os.path.join(seq_path, "cover.label") + + with open(occlusion_file, 'r', newline='') as f: + occlusion = np.array([int(v[0]) for v in csv.reader(f)], 'byte') + with open(cover_file, 'r', newline='') as f: + cover = np.array([int(v[0]) for v in csv.reader(f)], 'byte') + + target_visible = ~occlusion & (cover > 0) & (anno[:, 2] > 0) & ( + anno[:, 3] > 0) + + return target_visible + + def _get_sequence_path(self, seq_id): + return os.path.join(self.root, self.sequence_list[seq_id]) + + def get_sequence_info(self, seq_id): + seq_path = self._get_sequence_path(seq_id) + anno = self._read_anno(seq_path) + target_visible = self._read_target_visible(seq_path, anno) + if self.filter: + target_large = (anno[:, 2] * anno[:, 3] > 30 * 30) + ratio = anno[:, 2] / anno[:, 3] + target_reasonable_ratio = (10 > ratio) & (ratio > 0.1) + target_visible = target_visible & target_large & target_reasonable_ratio + return anno, target_visible + + def _get_frame_path(self, seq_path, frame_id): + return os.path.join( + seq_path, '{:08}.jpg'.format(frame_id + 1)) # frames start from 1 + + def _get_frame(self, seq_path, frame_id): + return self.image_loader(self._get_frame_path(seq_path, frame_id)) + + def get_frames(self, seq_id, frame_ids, anno=None): + seq_path = self._get_sequence_path(seq_id) + obj_meta = self.sequence_meta_info[self.sequence_list[seq_id]] + + frame_list = [self._get_frame(seq_path, f_id) for f_id in frame_ids] + + if anno is None: + anno = self._read_anno(seq_path) + + # Return as list of tensors + anno_frames = [anno[f_id, :] for f_id in frame_ids] + + return frame_list, anno_frames, obj_meta diff --git a/PaddleCV/tracking/ltr/dataset/imagenetvid.py b/PaddleCV/tracking/ltr/dataset/imagenetvid.py new file mode 100644 index 0000000000000000000000000000000000000000..15b5ae0a70996a22944ea68946b81e492789ccd2 --- /dev/null +++ b/PaddleCV/tracking/ltr/dataset/imagenetvid.py @@ -0,0 +1,201 @@ +import os +import numpy as np +from .base_dataset import BaseDataset +from ltr.data.image_loader import default_image_loader +import xml.etree.ElementTree as ET +import json +from collections import OrderedDict +import nltk +from nltk.corpus import wordnet +from ltr.admin.environment import env_settings + + +def get_target_to_image_ratio(seq): + anno = np.array(seq['anno']) + img_sz = np.array(seq['image_size']) + return np.sqrt(anno[0, 2:4].prod() / (img_sz.prod())) + + +class ImagenetVID(BaseDataset): + """ Imagenet VID dataset. + + Publication: + ImageNet Large Scale Visual Recognition Challenge + Olga Russakovsky, Jia Deng, Hao Su, Jonathan Krause, Sanjeev Satheesh, Sean Ma, Zhiheng Huang, Andrej Karpathy, + Aditya Khosla, Michael Bernstein, Alexander C. Berg and Li Fei-Fei + IJCV, 2015 + https://arxiv.org/pdf/1409.0575.pdf + + Download the dataset from http://image-net.org/ + """ + + def __init__(self, + root=None, + filter=None, + image_loader=default_image_loader, + min_length=0, + max_target_area=1): + """ + args: + root - path to the imagenet vid dataset. + image_loader (jpeg4py_loader) - The function to read the images. 
jpeg4py (https://github.com/ajkxyz/jpeg4py) + is used by default. + min_length - Minimum allowed sequence length. + max_target_area - max allowed ratio between target area and image area. Can be used to filter out targets + which cover complete image. + """ + root = env_settings().imagenet_dir if root is None else root + super().__init__(root, image_loader) + + cache_file = os.path.join(root, 'cache.json') + if os.path.isfile(cache_file): + # If available, load the pre-processed cache file containing meta-info for each sequence + with open(cache_file, 'r') as f: + sequence_list_dict = json.load(f) + + self.sequence_list = sequence_list_dict + else: + # Else process the imagenet annotations and generate the cache file + self.sequence_list = self._process_anno(root) + + with open(cache_file, 'w') as f: + json.dump(self.sequence_list, f) + + # Filter the sequences based on min_length and max_target_area in the first frame + self.sequence_list = [ + x for x in self.sequence_list + if len(x['anno']) >= min_length and get_target_to_image_ratio(x) < + max_target_area + ] + self.filter = filter + + def get_name(self): + return 'imagenetvid' + + def get_num_sequences(self): + return len(self.sequence_list) + + def get_sequence_info(self, seq_id): + anno = np.array(self.sequence_list[seq_id]['anno']) + target_visible = np.array(self.sequence_list[seq_id]['target_visible'], + 'bool') + target_visible = target_visible & (anno[:, 2] > 0) & (anno[:, 3] > 0) + if self.filter is not None: + target_large = (anno[:, 2] * anno[:, 3] > 30 * 30) + ratio = anno[:, 2] / anno[:, 3] + target_reasonable_ratio = (10 > ratio) & (ratio > 0.1) + target_visible = target_visible & target_reasonable_ratio & target_large + return anno, target_visible + + def _get_frame(self, sequence, frame_id): + set_name = 'ILSVRC2015_VID_train_{:04d}'.format(sequence['set_id']) + vid_name = 'ILSVRC2015_train_{:08d}'.format(sequence['vid_id']) + frame_number = frame_id + sequence['start_frame'] + + frame_path = os.path.join(self.root, 'Data', 'VID', 'train', set_name, + vid_name, '{:06d}.JPEG'.format(frame_number)) + # frame_path = os.path.join(self.root, 'Data', 'VID', 'train', vid_name, + # '{:06d}.jpg'.format(frame_number)) + return self.image_loader(frame_path) + + def get_frames(self, seq_id, frame_ids, anno=None): + sequence = self.sequence_list[seq_id] + + frame_list = [self._get_frame(sequence, f) for f in frame_ids] + + if anno is None: + anno = sequence['anno'] + + # Return as list of tensors + anno_frames = [anno[f_id, :] for f_id in frame_ids] + + # added the class info to the meta info + object_meta = OrderedDict({ + 'object_class': sequence['class_name'], + 'motion_class': None, + 'major_class': None, + 'root_class': None, + 'motion_adverb': None + }) + + return frame_list, anno_frames, object_meta + + def _process_anno(self, root): + # Builds individual tracklets + base_vid_anno_path = os.path.join(root, 'Annotations', 'VID', 'train') + + all_sequences = [] + # for set in sorted(os.listdir(base_vid_anno_path)): + for set in sorted([ + 'ILSVRC2015_VID_train_0000', 'ILSVRC2015_VID_train_0001', + 'ILSVRC2015_VID_train_0002', 'ILSVRC2015_VID_train_0003' + ]): + set_id = int(set.split('_')[-1]) + for vid in sorted( + os.listdir(os.path.join(base_vid_anno_path, set))): + + vid_id = int(vid.split('_')[-1]) + anno_files = sorted( + os.listdir(os.path.join(base_vid_anno_path, set, vid))) + + frame1_anno = ET.parse( + os.path.join(base_vid_anno_path, set, vid, anno_files[0])) + image_size = [ + 
int(frame1_anno.find('size/width').text), + int(frame1_anno.find('size/height').text) + ] + + objects = [ + ET.ElementTree(file=os.path.join(base_vid_anno_path, set, + vid, f)).findall('object') + for f in anno_files + ] + + tracklets = {} + + # Find all tracklets along with start frame + for f_id, all_targets in enumerate(objects): + for target in all_targets: + tracklet_id = target.find('trackid').text + if tracklet_id not in tracklets: + tracklets[tracklet_id] = f_id + + for tracklet_id, tracklet_start in tracklets.items(): + tracklet_anno = [] + target_visible = [] + class_name = None + + for f_id in range(tracklet_start, len(objects)): + found = False + for target in objects[f_id]: + if target.find('trackid').text == tracklet_id: + if not class_name: + class_name_id = target.find('name').text + class_name = class_name_id + # class_name = self._get_class_name_from_id(class_name_id) + x1 = int(target.find('bndbox/xmin').text) + y1 = int(target.find('bndbox/ymin').text) + x2 = int(target.find('bndbox/xmax').text) + y2 = int(target.find('bndbox/ymax').text) + + tracklet_anno.append([x1, y1, x2 - x1, y2 - y1]) + target_visible.append( + target.find('occluded').text == '0') + + found = True + break + if not found: + break + + new_sequence = { + 'set_id': set_id, + 'vid_id': vid_id, + 'class_name': class_name, + 'start_frame': tracklet_start, + 'anno': tracklet_anno, + 'target_visible': target_visible, + 'image_size': image_size + } + all_sequences.append(new_sequence) + + return all_sequences diff --git a/PaddleCV/tracking/ltr/dataset/lasot.py b/PaddleCV/tracking/ltr/dataset/lasot.py new file mode 100644 index 0000000000000000000000000000000000000000..11c5a3f173fed4b38dc610cd5c936c65ef532a1a --- /dev/null +++ b/PaddleCV/tracking/ltr/dataset/lasot.py @@ -0,0 +1,152 @@ +import os +import os.path +import numpy as np +import pandas +import csv +from collections import OrderedDict +from .base_dataset import BaseDataset +from ltr.data.image_loader import default_image_loader +from ltr.admin.environment import env_settings + + +class Lasot(BaseDataset): + """ LaSOT dataset. + + Publication: + LaSOT: A High-quality Benchmark for Large-scale Single Object Tracking + Heng Fan, Liting Lin, Fan Yang, Peng Chu, Ge Deng, Sijia Yu, Hexin Bai, Yong Xu, Chunyuan Liao and Haibin Ling + CVPR, 2019 + https://arxiv.org/pdf/1809.07845.pdf + + Download the dataset from https://cis.temple.edu/lasot/download.html + """ + + def __init__(self, + root=None, + filter=None, + image_loader=default_image_loader, + vid_ids=None, + split=None): + """ + args: + root - path to the lasot dataset. + image_loader (jpeg4py_loader) - The function to read the images. jpeg4py (https://github.com/ajkxyz/jpeg4py) + is used by default. + vid_ids - List containing the ids of the videos (1 - 20) used for training. If vid_ids = [1, 3, 5], then the + videos with subscripts -1, -3, and -5 from each class will be used for training. + split - If split='train', the official train split (protocol-II) is used for training. Note: Only one of + vid_ids or split option can be used at a time. 
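To make the vid_ids convention concrete, a toy expansion of the sequence list (the class names are assumed examples):

class_list = ['airplane', 'basketball']   # assumed example classes
vid_ids = [1, 3]
sequence_list = [c + '-' + str(v) for c in class_list for v in vid_ids]
# -> ['airplane-1', 'airplane-3', 'basketball-1', 'basketball-3'],
# matching the <root>/<class>/<class>-<id>/img/00000001.jpg layout used below.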
+ """ + root = env_settings().lasot_dir if root is None else root + super().__init__(root, image_loader) + + self.sequence_list = self._build_sequence_list(vid_ids, split) + self.filter = filter + + def _build_sequence_list(self, vid_ids=None, split=None): + if split is not None: + if vid_ids is not None: + raise ValueError('Cannot set both split_name and vid_ids.') + ltr_path = os.path.join( + os.path.dirname(os.path.realpath(__file__)), '..') + if split == 'train': + file_path = os.path.join(ltr_path, 'data_specs', + 'lasot_train_split.txt') + else: + raise ValueError('Unknown split name.') + sequence_list = pandas.read_csv( + file_path, header=None, squeeze=True).values.tolist() + elif vid_ids is not None: + sequence_list = [ + c + '-' + str(v) for c in self.class_list for v in vid_ids + ] + else: + raise ValueError('Set either split_name or vid_ids.') + + return sequence_list + + def get_name(self): + return 'lasot' + + def get_num_sequences(self): + return len(self.sequence_list) + + def _read_anno(self, seq_path): + anno_file = os.path.join(seq_path, "groundtruth.txt") + gt = pandas.read_csv( + anno_file, + delimiter=',', + header=None, + dtype=np.float32, + na_filter=False, + low_memory=False).values + return np.array(gt) + + def _read_target_visible(self, seq_path, anno): + # Read full occlusion and out_of_view + occlusion_file = os.path.join(seq_path, "full_occlusion.txt") + out_of_view_file = os.path.join(seq_path, "out_of_view.txt") + + with open(occlusion_file, 'r', newline='') as f: + occlusion = np.array([int(v) for v in list(csv.reader(f))[0]], + 'byte') + with open(out_of_view_file, 'r') as f: + out_of_view = np.array([int(v) for v in list(csv.reader(f))[0]], + 'byte') + + target_visible = ~occlusion & ~out_of_view & (anno[:, 2] > 0) & ( + anno[:, 3] > 0) + + return target_visible + + def _get_sequence_path(self, seq_id): + seq_name = self.sequence_list[seq_id] + class_name = seq_name.split('-')[0] + vid_id = seq_name.split('-')[1] + + return os.path.join(self.root, class_name, class_name + '-' + vid_id) + + def get_sequence_info(self, seq_id): + seq_path = self._get_sequence_path(seq_id) + anno = self._read_anno(seq_path) + target_visible = self._read_target_visible(seq_path, anno) + if self.filter is not None: + target_large = (anno[:, 2] * anno[:, 3] > 30 * 30) + ratio = anno[:, 2] / anno[:, 3] + target_reasonable_ratio = (10 > ratio) & (ratio > 0.1) + target_visible = target_visible & target_reasonable_ratio & target_large + return anno, target_visible + + def _get_frame_path(self, seq_path, frame_id): + return os.path.join( + seq_path, 'img', + '{:08}.jpg'.format(frame_id + 1)) # frames start from 1 + + def _get_frame(self, seq_path, frame_id): + return self.image_loader(self._get_frame_path(seq_path, frame_id)) + + def _get_class(self, seq_path): + obj_class = seq_path.split('/')[-2] + return obj_class + + def get_frames(self, seq_id, frame_ids, anno=None): + seq_path = self._get_sequence_path(seq_id) + + obj_class = self._get_class(seq_path) + frame_list = [self._get_frame(seq_path, f_id) for f_id in frame_ids] + + if anno is None: + anno = self._read_anno(seq_path) + + # Return as list of tensors + anno_frames = [anno[f_id, :] for f_id in frame_ids] + + object_meta = OrderedDict({ + 'object_class': obj_class, + 'motion_class': None, + 'major_class': None, + 'root_class': None, + 'motion_adverb': None + }) + + return frame_list, anno_frames, object_meta diff --git a/PaddleCV/tracking/ltr/dataset/tracking_net.py b/PaddleCV/tracking/ltr/dataset/tracking_net.py new file 
mode 100644 index 0000000000000000000000000000000000000000..62f5cb808f54069f76d60ec59b84d22815deb9d5 --- /dev/null +++ b/PaddleCV/tracking/ltr/dataset/tracking_net.py @@ -0,0 +1,117 @@ +import os +import os.path +import numpy as np +import pandas +from collections import OrderedDict + +from ltr.data.image_loader import default_image_loader +from .base_dataset import BaseDataset +from ltr.admin.environment import env_settings + + +def list_sequences(root, set_ids): + """ Lists all the videos in the input set_ids. Returns a list of tuples (set_id, video_name) + + args: + root: Root directory to TrackingNet + set_ids: Sets (0-11) which are to be used + + returns: + list - list of tuples (set_id, video_name) containing the set_id and video_name for each sequence + """ + sequence_list = [] + + for s in set_ids: + anno_dir = os.path.join(root, "TRAIN_" + str(s), "anno") + + sequences_cur_set = [(s, os.path.splitext(f)[0]) + for f in os.listdir(anno_dir) + if f.endswith('.txt')] + sequence_list += sequences_cur_set + + return sequence_list + + +class TrackingNet(BaseDataset): + """ TrackingNet dataset. + + Publication: + TrackingNet: A Large-Scale Dataset and Benchmark for Object Tracking in the Wild. + Matthias Mueller,Adel Bibi, Silvio Giancola, Salman Al-Subaihi and Bernard Ghanem + ECCV, 2018 + https://ivul.kaust.edu.sa/Documents/Publications/2018/TrackingNet%20A%20Large%20Scale%20Dataset%20and%20Benchmark%20for%20Object%20Tracking%20in%20the%20Wild.pdf + + Download the dataset using the toolkit https://github.com/SilvioGiancola/TrackingNet-devkit. + """ + + def __init__(self, + root=None, + image_loader=default_image_loader, + set_ids=None): + """ + args: + root - The path to the TrackingNet folder, containing the training sets. + image_loader (jpeg4py_loader) - The function to read the images. jpeg4py (https://github.com/ajkxyz/jpeg4py) + is used by default. + set_ids (None) - List containing the ids of the TrackingNet sets to be used for training. If None, all the + sets (0 - 11) will be used. + """ + root = env_settings().trackingnet_dir if root is None else root + super().__init__(root, image_loader) + + if set_ids is None: + set_ids = [i for i in range(12)] + + self.set_ids = set_ids + + # Keep a list of all videos. 
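For reference, `list_sequences` above assumes the standard TrackingNet layout of `TRAIN_0` … `TRAIN_11` chunks, each holding per-video annotation files under `anno/` (with frames under `frames/<video>/<frame_id>.jpg`, as `_get_frame` later expects). A hedged sketch of that enumeration on a throwaway directory; the chunk ids and video names below are invented:

```python
import os
import tempfile

# Build a tiny fake TrackingNet root: two chunks with a few annotation files each.
root = tempfile.mkdtemp()
for set_id, videos in [(0, ['car-1', 'dog-3']), (1, ['person-7'])]:
    anno_dir = os.path.join(root, 'TRAIN_{}'.format(set_id), 'anno')
    os.makedirs(anno_dir)
    for v in videos:
        open(os.path.join(anno_dir, v + '.txt'), 'w').close()

# Same enumeration as list_sequences(root, set_ids): (set_id, video_name) tuples.
sequence_list = []
for s in [0, 1]:
    anno_dir = os.path.join(root, 'TRAIN_' + str(s), 'anno')
    sequence_list += [(s, os.path.splitext(f)[0])
                      for f in os.listdir(anno_dir) if f.endswith('.txt')]
print(sorted(sequence_list))
# [(0, 'car-1'), (0, 'dog-3'), (1, 'person-7')]
```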
Sequence list is a list of tuples (set_id, video_name) containing the set_id and + # video_name for each sequence + self.sequence_list = list_sequences(self.root, self.set_ids) + + def get_name(self): + return 'trackingnet' + + def _read_anno(self, seq_id): + set_id = self.sequence_list[seq_id][0] + vid_name = self.sequence_list[seq_id][1] + anno_file = os.path.join(self.root, "TRAIN_" + str(set_id), "anno", + vid_name + ".txt") + gt = pandas.read_csv( + anno_file, + delimiter=',', + header=None, + dtype=np.float32, + na_filter=False, + low_memory=False).values + return np.array(gt) + + def get_sequence_info(self, seq_id): + anno = self._read_anno(seq_id) + target_visible = (anno[:, 2] > 0) & (anno[:, 3] > 0) + return anno, target_visible + + def _get_frame(self, seq_id, frame_id): + set_id = self.sequence_list[seq_id][0] + vid_name = self.sequence_list[seq_id][1] + frame_path = os.path.join(self.root, "TRAIN_" + str(set_id), "frames", + vid_name, str(frame_id) + ".jpg") + return self.image_loader(frame_path) + + def get_frames(self, seq_id, frame_ids, anno=None): + frame_list = [self._get_frame(seq_id, f) for f in frame_ids] + + if anno is None: + anno = self._read_anno(seq_id) + + # Return as list of tensors + anno_frames = [anno[f_id, :] for f_id in frame_ids] + + object_meta = OrderedDict({ + 'object_class': None, + 'motion_class': None, + 'major_class': None, + 'root_class': None, + 'motion_adverb': None + }) + + return frame_list, anno_frames, object_meta diff --git a/PaddleCV/tracking/ltr/dataset/vot.py b/PaddleCV/tracking/ltr/dataset/vot.py new file mode 100644 index 0000000000000000000000000000000000000000..3720a0585642af8b7b1deadea8009d17f977869c --- /dev/null +++ b/PaddleCV/tracking/ltr/dataset/vot.py @@ -0,0 +1,140 @@ +import os +from .base_dataset import BaseDataset +from ltr.data.image_loader import default_image_loader +import numpy as np +import cv2 as cv +from collections import OrderedDict +from ltr.admin.environment import env_settings + + +def get_axis_aligned_bbox(region): + region = np.array(region) + if len(region.shape) == 3: + # region (1,4,2) + region = np.array([ + region[0][0][0], region[0][0][1], region[0][1][0], region[0][1][1], + region[0][2][0], region[0][2][1], region[0][3][0], region[0][3][1] + ]) + + cx = np.mean(region[0::2]) + cy = np.mean(region[1::2]) + x1 = min(region[0::2]) + + x2 = max(region[0::2]) + y1 = min(region[1::2]) + y2 = max(region[1::2]) + + A1 = np.linalg.norm(region[0:2] - region[2:4]) * np.linalg.norm(region[ + 2:4] - region[4:6]) + A2 = (x2 - x1) * (y2 - y1) + s = np.sqrt(A1 / A2) + w = s * (x2 - x1) + 1 + h = s * (y2 - y1) + 1 + + x11 = cx - w // 2 + y11 = cy - h // 2 + + return x11, y11, w, h + + +class VOT(BaseDataset): + def __init__(self, root=None, image_loader=default_image_loader): + # root = env_settings().vot_dir if root is None else root + assert root is not None + super().__init__(root, image_loader) + + self.sequence_list = self._get_sequence_list() + self.ann = self._get_annotations() + + def _get_sequence_list(self): + seq_list = [] + for d in os.listdir(self.root): + if os.path.isdir(os.path.join(self.root, d)): + seq_list.append(d) + return sorted(seq_list) + + def _get_annotations(self): + ann = {} + for seq in self.sequence_list: + ann[seq] = {'bbox': [], 'rbb': []} + with open(os.path.join(self.root, seq, 'groundtruth.txt')) as f: + lines = [l.strip().split(',') for l in f.readlines()] + for l in lines: + vs = [float(v) for v in l] + if len(vs) == 4: + polys = [ + vs[0], vs[1] + vs[3] - 1, vs[0], vs[1], + vs[0] + 
vs[2] - 1, vs[1], vs[0] + vs[2] - 1, + vs[1] + vs[3] - 1 + ] + else: + polys = vs + + box = get_axis_aligned_bbox(polys) + rbb = cv.minAreaRect( + np.int0(np.array(polys).reshape((-1, 2)))) + # assume small rotation angle, switch height, width + if rbb[2] < -45: + angle = rbb[2] + 90 + height = rbb[1][0] + width = rbb[1][1] + else: + angle = rbb[2] + height = rbb[1][1] + width = rbb[1][0] + rbb = [rbb[0][0], rbb[0][1], width, height, angle] + ann[seq]['bbox'].append(box) + ann[seq]['rbb'].append(rbb) + return ann + + def is_video_sequence(self): + return True + + def get_name(self): + return 'vot' + + def get_num_sequences(self): + return len(self.sequence_list) + + def get_sequence_info(self, seq_id): + anno = self._get_anno(seq_id) + target_visible = (anno[:, 2] > 0) & (anno[:, 3] > 0) + return anno, target_visible + + def _get_anno(self, seq_id): + anno = self.ann[self.sequence_list[seq_id]]['bbox'] + return np.reshape(np.array(anno), (-1, 4)) + + def get_meta_info(self, seq_id): + object_meta = OrderedDict({ + 'object_class': None, + 'motion_class': None, + 'major_class': None, + 'root_class': None, + 'motion_adverb': None + }) + return object_meta + + def _get_sequence_path(self, seq_id): + return os.path.join(self.root, self.sequence_list[seq_id]) + + def _get_frame_path(self, seq_path, frame_id): + return os.path.join( + seq_path, 'color', + '{:08}.jpg'.format(frame_id + 1)) # frames start from 1 + + def _get_frame(self, seq_path, frame_id): + return self.image_loader(self._get_frame_path(seq_path, frame_id)) + + def get_frames(self, seq_id=None, frame_ids=None, anno=None): + seq_path = self._get_sequence_path(seq_id) + frame_list = [self._get_frame(seq_path, f_id) for f_id in frame_ids] + + if anno is None: + anno = self._get_anno(seq_id) + + anno_frames = [anno[f_id, :] for f_id in frame_ids] + + object_meta = self.get_meta_info(seq_id) + + return frame_list, anno_frames, object_meta diff --git a/PaddleCV/tracking/ltr/dataset/youtube_bb.py b/PaddleCV/tracking/ltr/dataset/youtube_bb.py new file mode 100644 index 0000000000000000000000000000000000000000..5628c5714d50976b6718874776a2cd01403ec8e2 --- /dev/null +++ b/PaddleCV/tracking/ltr/dataset/youtube_bb.py @@ -0,0 +1,114 @@ +import os +from .base_dataset import BaseDataset +from ltr.data.image_loader import default_image_loader +import xml.etree.ElementTree as ET +import json +import pickle +from collections import OrderedDict +import numpy as np +import nltk +from nltk.corpus import wordnet +from ltr.admin.environment import env_settings + + +def get_target_to_image_ratio(seq): + anno = np.array(seq['anno']) + img_sz = np.array(seq['image_size']) + return np.sqrt(anno[0, 2:4].prod() / (img_sz.prod())) + + +class YoutubeBB(BaseDataset): + """ YoutubeBB dataset. + + Publication: + ImageNet Large Scale Visual Recognition Challenge + Olga Russakovsky, Jia Deng, Hao Su, Jonathan Krause, Sanjeev Satheesh, Sean Ma, Zhiheng Huang, Andrej Karpathy, + Aditya Khosla, Michael Bernstein, Alexander C. Berg and Li Fei-Fei + IJCV, 2015 + https://arxiv.org/pdf/1409.0575.pdf + + Download the dataset from http://image-net.org/ + """ + + def __init__(self, + root=None, + filter=None, + image_loader=default_image_loader, + min_length=0, + max_target_area=1): + """ + args: + root - path to the imagenet vid dataset. + image_loader (jpeg4py_loader) - The function to read the images. jpeg4py (https://github.com/ajkxyz/jpeg4py) + is used by default. + min_length - Minimum allowed sequence length. 
+ max_target_area - max allowed ratio between target area and image area. Can be used to filter out targets + which cover complete image. + """ + super().__init__(root, image_loader) + + meta_file = os.path.join(root, 'ytb_meta.pickle') + with open(meta_file, 'rb') as f: + meta = pickle.load(f) + + sequence_list = [] + for video_name, video_info in meta: + if 'ILSVRC' not in video_name: + seq_info = {} + for trkid in video_info: + if len(video_info[trkid]['img']) > 2: + seq_info['video_name'] = video_name + seq_info['anno'] = video_info[trkid]['box'] + seq_info['img_paths'] = video_info[trkid]['img'] + sequence_list.append(seq_info) + + print('num_sequences: {}'.format(len(sequence_list))) + self.sequence_list = sequence_list + + # Filter the sequences based on min_length and max_target_area in the first frame + # self.sequence_list = [x for x in self.sequence_list if len(x['anno']) >= min_length and + # get_target_to_image_ratio(x) < max_target_area] + self.filter = filter + + def get_name(self): + return 'youtubebb' + + def get_num_sequences(self): + return len(self.sequence_list) + + def get_sequence_info(self, seq_id): + anno = np.array(self.sequence_list[seq_id]['anno']) + target_visible = (anno[:, 2] > 0) & (anno[:, 3] > 0) + if self.filter is not None: + target_large = (anno[:, 2] * anno[:, 3] > 30 * 30) + target_resonable = (anno[:, 2] * anno[:, 3] < 500 * 500) + ratio = anno[:, 2] / anno[:, 3] + target_reasonable_ratio = (10 > ratio) & (ratio > 0.1) + target_visible = target_visible & target_reasonable_ratio & target_large & target_resonable + return anno, target_visible + + def _get_frame(self, sequence, frame_id): + frame_path = os.path.join(self.root, sequence['video_name'], + sequence['img_paths'][frame_id] + '.jpg') + return self.image_loader(frame_path) + + def get_frames(self, seq_id, frame_ids, anno=None): + sequence = self.sequence_list[seq_id] + frame_list = [self._get_frame(sequence, f) for f in frame_ids] + + if anno is None: + anno = sequence['anno'] + + # Return as list of tensors + anno_frames = [anno[f_id, :] for f_id in frame_ids] + + # added the class info to the meta info + object_meta = OrderedDict({ + 'object_class': None, + 'motion_class': None, + 'major_class': None, + 'root_class': None, + 'motion_adverb': None + }) + + return frame_list, anno_frames, object_meta diff --git a/PaddleCV/tracking/ltr/dataset/youtube_vos.py b/PaddleCV/tracking/ltr/dataset/youtube_vos.py new file mode 100644 index 0000000000000000000000000000000000000000..f884272f4bab666b740cc80461de7056ec39ed96 --- /dev/null +++ b/PaddleCV/tracking/ltr/dataset/youtube_vos.py @@ -0,0 +1,152 @@ +import os +from .base_dataset import BaseDataset +from ltr.data.image_loader import default_image_loader +import numpy as np +import cv2 as cv +import json +from collections import OrderedDict +from ltr.admin.environment import env_settings + + +def get_axis_aligned_bbox(region): + region = np.array(region) + if len(region.shape) == 3: + # region (1,4,2) + region = np.array([ + region[0][0][0], region[0][0][1], region[0][1][0], region[0][1][1], + region[0][2][0], region[0][2][1], region[0][3][0], region[0][3][1] + ]) + + cx = np.mean(region[0::2]) + cy = np.mean(region[1::2]) + x1 = min(region[0::2]) + + x2 = max(region[0::2]) + y1 = min(region[1::2]) + y2 = max(region[1::2]) + + A1 = np.linalg.norm(region[0:2] - region[2:4]) * np.linalg.norm(region[ + 2:4] - region[4:6]) + A2 = (x2 - x1) * (y2 - y1) + s = np.sqrt(A1 / A2) + if s is np.nan: + x11, y11, w, h = 0, 0, 0, 0 + else: + w = s * (x2 - x1) + 1 + h 
= s * (y2 - y1) + 1 + + x11 = cx - w // 2 + y11 = cy - h // 2 + return x11, y11, w, h + + +class VOS(BaseDataset): + def __init__(self, root=None, image_loader=default_image_loader): + # root = env_settings().vot_dir if root is None else root + assert root is not None + super().__init__(root, image_loader) + + with open(os.path.join(self.root, 'meta.json')) as f: + self.meta = json.load(f)['videos'] + + self.sequence_list = self._get_sequence_list() + self.ann = self._get_annotations() + + def _get_sequence_list(self): + seq_list = [] + videos = self.meta.keys() + for v in videos: + objs = self.meta[v]['objects'].keys() + for o in objs: + if "rotate_box" in self.meta[v]['objects'][o]: + seq_list.append((v, o)) + assert len(seq_list) > 0 + return seq_list + + def _get_annotations(self): + ann = {} + for seq in self.sequence_list: + ann[seq] = {'bbox': [], 'rbb': []} + polygons = self.meta[seq[0]]['objects'][seq[1]]['rotate_box'] + for vs in polygons: + if len(vs) == 4: + polys = [ + vs[0], vs[1] + vs[3] - 1, vs[0], vs[1], + vs[0] + vs[2] - 1, vs[1], vs[0] + vs[2] - 1, + vs[1] + vs[3] - 1 + ] + else: + polys = vs + if not np.all(polys == 0): + box = get_axis_aligned_bbox(polys) + rbb = cv.minAreaRect( + np.int0(np.array(polys).reshape((-1, 2)))) + else: + box = np.array([0, 0, 0, 0]) + rbb = ((0, 0), (0, 0), 0) + if box[2] * box[3] > 500 * 500: + print(box) + # assume small rotation angle, switch height, width + if rbb[2] < -45: + angle = rbb[2] + 90 + height = rbb[1][0] + width = rbb[1][1] + else: + angle = rbb[2] + height = rbb[1][1] + width = rbb[1][0] + rbb = [rbb[0][0], rbb[0][1], width, height, angle] + ann[seq]['bbox'].append(box) + ann[seq]['rbb'].append(rbb) + return ann + + def is_video_sequence(self): + return True + + def get_name(self): + return 'vot' + + def get_num_sequences(self): + return len(self.sequence_list) + + def get_sequence_info(self, seq_id): + anno = self._get_anno(seq_id) + target_visible = (anno[:, 2] > 0) & (anno[:, 3] > 0) + target_large = (anno[:, 2] * anno[:, 3] > 30 * 30) + target_resonable = (anno[:, 2] * anno[:, 3] < 500 * 500) + return anno, target_visible & target_large & target_resonable + + def _get_anno(self, seq_id): + anno = self.ann[self.sequence_list[seq_id]]['bbox'] + return np.reshape(np.array(anno), (-1, 4)) + + def get_meta_info(self, seq_id): + object_meta = OrderedDict({ + 'object_class': None, + 'motion_class': None, + 'major_class': None, + 'root_class': None, + 'motion_adverb': None + }) + return object_meta + + def _get_frame_path(self, seq_id, frame_id): + v, o = self.sequence_list[seq_id] + frame_name = self.meta[v]['objects'][o]['frames'][frame_id] + return os.path.join(self.root, 'JPEGImages', v, + '{}.jpg'.format(frame_name)) # frames start from 1 + + def _get_frame(self, seq_id, frame_id): + return self.image_loader(self._get_frame_path(seq_id, frame_id)) + + def get_frames(self, seq_id=None, frame_ids=None, anno=None): + frame_list = [self._get_frame(seq_id, f_id) for f_id in frame_ids] + + if anno is None: + anno = self._get_anno(seq_id) + + anno_frames = [anno[f_id, :] for f_id in frame_ids] + + object_meta = self.get_meta_info(seq_id) + + return frame_list, anno_frames, object_meta diff --git a/PaddleCV/tracking/ltr/models/backbone/resnet.py b/PaddleCV/tracking/ltr/models/backbone/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..a576ad577f64499cafabc4c2185d84e93130c027 --- /dev/null +++ b/PaddleCV/tracking/ltr/models/backbone/resnet.py @@ -0,0 +1,322 @@ +import os + +import paddle.fluid as fluid 
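Both `vot.py` and `youtube_vos.py` above reduce a 4-corner polygon annotation to an axis-aligned `(x, y, w, h)` box with `get_axis_aligned_bbox`: the axis-aligned extent of the polygon is shrunk by `s = sqrt(A1 / A2)` so that the returned width and height roughly preserve the area of the rotated rectangle. A NumPy walk-through of the same arithmetic on an invented rotated rectangle:

```python
import numpy as np

# Corners of a rotated rectangle (x1, y1, ..., x4, y4); values invented for illustration.
region = np.array([0., 5., 5., 0., 15., 10., 10., 15.])

cx, cy = np.mean(region[0::2]), np.mean(region[1::2])
x1, x2 = region[0::2].min(), region[0::2].max()
y1, y2 = region[1::2].min(), region[1::2].max()

A1 = np.linalg.norm(region[0:2] - region[2:4]) * np.linalg.norm(region[2:4] - region[4:6])
A2 = (x2 - x1) * (y2 - y1)          # area of the axis-aligned extent
s = np.sqrt(A1 / A2)                # shrink factor so w * h ~ rotated-rectangle area

w, h = s * (x2 - x1) + 1, s * (y2 - y1) + 1
x11, y11 = cx - w // 2, cy - h // 2
print(x11, y11, w, h)               # -> roughly (2.5, 2.5, 11.0, 11.0)
```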
+import paddle.fluid.dygraph.nn as nn +from ltr.admin.environment import env_settings + +CURRENT_DIR = os.path.dirname(__file__) + + +def weight_init(): + init = fluid.initializer.MSRAInitializer(uniform=False) + param = fluid.ParamAttr(initializer=init) + return param + + +def norm_weight_init(constant=1.0): + init = fluid.initializer.ConstantInitializer(constant) + param = fluid.ParamAttr(initializer=init) + return param + + +def norm_bias_init(): + init = fluid.initializer.ConstantInitializer(value=0.) + param = fluid.ParamAttr(initializer=init) + return param + + +class ConvBNLayer(fluid.dygraph.Layer): + def __init__(self, + in_channels, + out_channels, + filter_size, + stride=1, + groups=1, + bn_init_constant=1.0, + is_test=False): + super(ConvBNLayer, self).__init__() + + self.conv = nn.Conv2D( + num_channels=in_channels, + filter_size=filter_size, + num_filters=out_channels, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + bias_attr=False, + param_attr=weight_init()) + self.bn = nn.BatchNorm( + out_channels, + param_attr=norm_weight_init(bn_init_constant), + bias_attr=norm_bias_init(), + act=None, + momentum=0.9, + use_global_stats=is_test) + + def forward(self, inputs): + res = self.conv(inputs) + self.conv_res = res + res = self.bn(res) + return res + + +class BasicBlock(fluid.dygraph.Layer): + expansion = 1 + + def __init__(self, + in_channels, + out_channels, + stride=1, + is_downsample=None, + is_test=False): + + super(BasicBlock, self).__init__() + self.expansion = 1 + + self.conv_bn1 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + filter_size=3, + stride=stride, + groups=1, + is_test=is_test) + self.conv_bn2 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + filter_size=3, + stride=1, + groups=1, + is_test=is_test) + + self.is_downsample = is_downsample + if self.is_downsample: + self.downsample = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + filter_size=1, + stride=stride, + is_test=is_test) + self.stride = stride + + def forward(self, inputs): + identity = inputs + res = self.conv_bn1(inputs) + res = fluid.layers.relu(res) + + res = self.conv_bn2(res) + + if self.is_downsample: + identity = self.downsample(identity) + + res += identity + res = fluid.layers.relu(res) + return res + + +class Bottleneck(fluid.dygraph.Layer): + expansion = 4 + + def __init__(self, + in_channels, + out_channels, + stride=1, + is_downsample=None, + base_width=64, + dilation=1, + groups=1, + is_test=False): + super(Bottleneck, self).__init__() + + width = int(out_channels * (base_width / 64.)) * groups + + self.conv_bn1 = ConvBNLayer( + in_channels=in_channels, + filter_size=1, + out_channels=width, + groups=1, + is_test=is_test) + self.conv_bn2 = ConvBNLayer( + in_channels=width, + filter_size=3, + out_channels=width, + stride=stride, + groups=groups, + is_test=is_test) + self.conv_bn3 = ConvBNLayer( + in_channels=width, + filter_size=1, + out_channels=out_channels * self.expansion, + bn_init_constant=0., + is_test=is_test) + self.is_downsample = is_downsample + if self.is_downsample: + self.downsample = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels * self.expansion, + filter_size=1, + stride=stride, + is_test=is_test) + + self.stride = stride + + def forward(self, inputs): + identify = inputs + + out = self.conv_bn1(inputs) + out = fluid.layers.relu(out) + + out = self.conv_bn2(out) + out = fluid.layers.relu(out) + + out = self.conv_bn3(out) + + if self.is_downsample: + identify = 
self.downsample(inputs) + + out += identify + out = fluid.layers.relu(out) + return out + + +class ResNet(fluid.dygraph.Layer): + def __init__(self, + name, + Block, + layers, + num_classes=1000, + groups=1, + is_test=False): + """ + + :param name: str, namescope + :param layers: int, the layer of defined network + :param num_classes: int, the dimension of final output + :param groups: int, default is 1 + """ + super(ResNet, self).__init__(name_scope=name) + + support_layers = [18, 34, 50, 101, 152] + assert layers in support_layers, \ + "support layer can only be one of [18, 34, 50, 101, 152]" + self.layers = layers + + if layers == 18: + depths = [2, 2, 2, 2] + elif layers == 50 or layers == 34: + depths = [3, 4, 6, 3] + elif layers == 101: + depths = [3, 4, 23, 3] + elif layers == 152: + depths = [3, 8, 36, 3] + + strides = [1, 2, 2, 2] + num_filters = [64, 128, 256, 512] + + self.in_channels = 64 + self.dilation = 1 + self.groups = groups + + self.conv_bn_init = ConvBNLayer( + 3, + out_channels=self.in_channels, + filter_size=7, + stride=2, + is_test=is_test) + + block_collect = [] + downsample = None + for i in range(len(depths)): + # collect layers in each block + _block = [] + + stride = strides[i] + out_channel = num_filters[i] + + if stride != 1 or self.in_channels != num_filters[ + i] * Block.expansion: + downsample = True + bottleneck_block = self.add_sublayer( + "block{}_0".format(i), + Block( + self.in_channels, + out_channel, + stride=stride, + is_downsample=downsample, + is_test=is_test)) + + downsample = False + + _block.append(bottleneck_block) + + self.in_channels = num_filters[i] * Block.expansion + + for j in range(1, depths[i]): + bottleneck_block = self.add_sublayer( + "block{}_{}".format(i, j), + Block( + self.in_channels, out_channel, is_test=is_test)) + _block.append(bottleneck_block) + + # collect blocks + block_collect.append(_block) + + self.block_collect = block_collect + + self.maxpool = nn.Pool2D( + pool_size=3, pool_stride=2, pool_padding=1, pool_type="max") + + self.global_pool = nn.Pool2D(pool_type='avg', global_pooling=True) + self.fc = nn.Linear( + input_dim=512 * Block.expansion, output_dim=num_classes) + + def _add_output_and_check(self, name, x, outputs, output_layers): + if name in output_layers: + outputs[name] = x + return len(output_layers) == len(outputs) + + def forward(self, inputs, feat_layers): + out = {} + res = self.conv_bn_init(inputs) + res = fluid.layers.relu(res) + res = self.maxpool(res) + + # out['conv_init'] = res + for i in range(len(self.block_collect)): + + for layer in self.block_collect[i]: + res = layer(res) + + name = 'block{}'.format(i) + if name in feat_layers: + out[name] = res + if len(out) == len(feat_layers): + return out + + res = self.global_pool(res) + B, C, _, _ = res.shape + res = fluid.layers.reshape(res, [B, C]) + res = self.fc(res) + out['fc'] = res + return out + + +def resnet18(name, is_test=False, pretrained=False): + net = ResNet(name, Block=BasicBlock, layers=18, is_test=is_test) + if pretrained: + params_path = os.path.join(env_settings().backbone_dir, 'ResNet18') + print("=> loading backbone model from '{}'".format(params_path)) + params, _ = fluid.load_dygraph(params_path) + net.load_dict(params) + print("Done") + return net + + +def resnet50(name, is_test=False, pretrained=False): + net = ResNet(name, Block=Bottleneck, layers=50, is_test=is_test) + if pretrained: + params_path = os.path.join(env_settings().backbone_dir, 'ResNet50') + print("=> loading backbone model from '{}'".format(params_path)) + 
params, _ = fluid.load_dygraph(params_path) + net.load_dict(params) + print("Done") + return net diff --git a/PaddleCV/tracking/ltr/models/backbone/sfc_alexnet.py b/PaddleCV/tracking/ltr/models/backbone/sfc_alexnet.py new file mode 100644 index 0000000000000000000000000000000000000000..4c80df9115f9b34d9ef30ba59c54b45d9b8da87a --- /dev/null +++ b/PaddleCV/tracking/ltr/models/backbone/sfc_alexnet.py @@ -0,0 +1,149 @@ +from collections import OrderedDict + +from paddle import fluid +from paddle.fluid.dygraph import nn + + +class SFC_AlexNet(fluid.dygraph.Layer): + def __init__(self, name, is_test): + super(SFC_AlexNet, self).__init__() + + self.is_test = is_test + self.layer_init() + + def layer_init(self): + # for conv1 + self.conv1 = nn.Conv2D( + num_channels=3, + num_filters=96, + filter_size=11, + stride=2, + padding=0, + groups=1, + param_attr=self.weight_init(), + bias_attr=self.bias_init()) + self.bn1 = nn.BatchNorm( + num_channels=96, + is_test=self.is_test, + param_attr=self.norm_weight_init(), + bias_attr=self.bias_init(), + use_global_stats=self.is_test) + self.pool1 = nn.Pool2D( + pool_size=3, pool_type="max", pool_stride=2, pool_padding=0) + # for conv2 + self.conv2 = nn.Conv2D( + num_channels=96, + num_filters=256, + filter_size=5, + stride=1, + padding=0, + groups=2, + param_attr=self.weight_init(), + bias_attr=self.bias_init()) + self.bn2 = nn.BatchNorm( + num_channels=256, + is_test=self.is_test, + param_attr=self.norm_weight_init(), + bias_attr=self.bias_init(), + use_global_stats=self.is_test) + self.pool2 = nn.Pool2D( + pool_size=3, pool_type="max", pool_stride=2, pool_padding=0) + # for conv3 + self.conv3 = nn.Conv2D( + num_channels=256, + num_filters=384, + filter_size=3, + stride=1, + padding=0, + groups=1, + param_attr=self.weight_init(), + bias_attr=self.bias_init()) + self.bn3 = nn.BatchNorm( + num_channels=384, + is_test=self.is_test, + param_attr=self.norm_weight_init(), + bias_attr=self.bias_init(), + use_global_stats=self.is_test) + # for conv4 + self.conv4 = nn.Conv2D( + num_channels=384, + num_filters=384, + filter_size=3, + stride=1, + padding=0, + groups=2, + param_attr=self.weight_init(), + bias_attr=self.bias_init()) + self.bn4 = nn.BatchNorm( + num_channels=384, + is_test=self.is_test, + param_attr=self.norm_weight_init(), + bias_attr=self.bias_init(), + use_global_stats=self.is_test) + # for conv5 + self.conv5 = nn.Conv2D( + num_channels=384, + num_filters=256, + filter_size=3, + stride=1, + padding=0, + groups=2, + param_attr=self.weight_init(), + bias_attr=self.bias_init()) + + def _add_output_and_check(self, name, x, outputs, output_layers): + if name in output_layers: + outputs[name] = x + return len(output_layers) == len(outputs) + + def forward(self, inputs, output_layers): + outputs = OrderedDict() + + out1 = self.conv1(inputs) + out1 = self.bn1(out1) + out1 = fluid.layers.relu(out1) + if self._add_output_and_check('conv1', out1, outputs, output_layers): + return outputs + + out1 = self.pool1(out1) + + out2 = self.conv2(out1) + out2 = self.bn2(out2) + out2 = fluid.layers.relu(out2) + if self._add_output_and_check('conv2', out2, outputs, output_layers): + return outputs + + out2 = self.pool2(out2) + + out3 = self.conv3(out2) + out3 = self.bn3(out3) + out3 = fluid.layers.relu(out3) + if self._add_output_and_check('conv3', out3, outputs, output_layers): + return outputs + + out4 = self.conv4(out3) + out4 = self.bn4(out4) + out4 = fluid.layers.relu(out4) + if self._add_output_and_check('conv4', out4, outputs, output_layers): + return outputs + + out5 = 
self.conv5(out4) + if self._add_output_and_check('conv5', out5, outputs, output_layers): + return outputs + + return outputs + + def norm_weight_init(self): + init = fluid.initializer.ConstantInitializer(1.0) + param = fluid.ParamAttr(initializer=init) + return param + + def weight_init(self): + init = fluid.initializer.MSRAInitializer(uniform=False) + param = fluid.ParamAttr(initializer=init) + return param + + def bias_init(self): + init = fluid.initializer.ConstantInitializer(value=0.) + param = fluid.ParamAttr(initializer=init) + return param diff --git a/PaddleCV/tracking/ltr/models/bbreg/__init__.py b/PaddleCV/tracking/ltr/models/bbreg/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2f2a4538cf68a96c1a3384eb197fbe5e53d500b7 --- /dev/null +++ b/PaddleCV/tracking/ltr/models/bbreg/__init__.py @@ -0,0 +1 @@ +from .atom_iou_net import AtomIouNet diff --git a/PaddleCV/tracking/ltr/models/bbreg/atom.py b/PaddleCV/tracking/ltr/models/bbreg/atom.py new file mode 100644 index 0000000000000000000000000000000000000000..ef0ebd2e84964e98efd55454c8fc1d488a13cc66 --- /dev/null +++ b/PaddleCV/tracking/ltr/models/bbreg/atom.py @@ -0,0 +1,149 @@ +import paddle +import paddle.fluid as fluid +import paddle.fluid.dygraph as dygraph +import os.path as osp +import sys + +CURRENT_DIR = osp.dirname(__file__) +sys.path.append(osp.join(CURRENT_DIR, '..', '..', '..')) + +from ltr.models.backbone.resnet import resnet50, resnet18 +from ltr.models.bbreg.atom_iou_net import AtomIouNet + + +class ATOMnet(dygraph.layers.Layer): + def __init__(self, + name, + feature_extractor, + bb_regressor, + bb_regressor_layer, + extractor_grad=True): + """ + + :param feature_extractor: backbone + :param bb_regressor: IOUnet + :param bb_regressor_layer: list, which layer is used in IOUnet, + :param extractor_grad: default is True + """ + super(ATOMnet, self).__init__(name) + + self.feature_extractor = feature_extractor + self.bb_regressor = bb_regressor + self.bb_regressor_layer = bb_regressor_layer + + layers_gt = ['block0', 'block1', 'block2', 'block3', 'fc'] + if bb_regressor_layer is not None: + for key in bb_regressor_layer: + assert key in layers_gt + else: + raise ValueError("bb_regressor_layer can only be one of :", + layers_gt) + + def forward(self, train_imgs, test_imgs, train_bb, test_proposals): + num_sequences = train_imgs.shape[-4] + num_train_images = train_imgs.shape[0] if len( + train_imgs.shape) == 5 else 1 + num_test_images = test_imgs.shape[0] if len(test_imgs.shape) == 5 else 1 + + if len(train_imgs.shape) == 5: + train_imgs = fluid.layers.reshape( + train_imgs, [-1, *list(train_imgs.shape)[-3:]]) + test_imgs = fluid.layers.reshape(test_imgs, + [-1, *list(test_imgs.shape)[-3:]]) + + train_feat = self.extract_backbone_features(train_imgs) + test_feat = self.extract_backbone_features(test_imgs) + + # For clarity, send the features to bb_regressor in sequenceform, i.e. 
[sequence, batch, feature, row, col] + train_feat_iou = [ + fluid.layers.reshape(feat, (num_train_images, num_sequences, + *feat.shape[-3:])) + for feat in train_feat.values() + ] + test_feat_iou = [ + fluid.layers.reshape(feat, (num_test_images, num_sequences, + *feat.shape[-3:])) + for feat in test_feat.values() + ] + + # Obtain iou prediction + iou_pred = self.bb_regressor(train_feat_iou, test_feat_iou, train_bb, + test_proposals) + return iou_pred + + def extract_backbone_features(self, im, layers=None): + if layers is None: + layers = self.bb_regressor_layer + return self.feature_extractor(im, layers) + + def extract_features(self, im, layers): + return self.feature_extractor(im, layers) + + +def atom_resnet18(iou_input_dim=(256, 256), + iou_inter_dim=(256, 256), + backbone_pretrained=True, + backbone_is_test=False, + iounet_is_test=False): + backbone = resnet18( + 'ResNet18', is_test=backbone_is_test, pretrained=backbone_pretrained) + iou_predictor = AtomIouNet( + 'IOUnet', + pred_input_dim=iou_input_dim, + pred_inter_dim=iou_inter_dim, + is_test=iounet_is_test) + + model = ATOMnet( + 'ATOM', + feature_extractor=backbone, + bb_regressor=iou_predictor, + bb_regressor_layer=['block1', 'block2'], + extractor_grad=False) + return model + + +def atom_resnet50(iou_input_dim=(256, 256), + iou_inter_dim=(256, 256), + backbone_pretrained=True, + backbone_is_test=False, + iounet_is_test=False): + backbone = resnet50( + 'ResNet50', is_test=backbone_is_test, pretrained=backbone_pretrained) + iou_predictor = AtomIouNet( + 'IOUnet', + input_dim=(512, 1024), + pred_input_dim=iou_input_dim, + pred_inter_dim=iou_inter_dim, + is_test=iounet_is_test) + + model = ATOMnet( + 'ATOM', + feature_extractor=backbone, + bb_regressor=iou_predictor, + bb_regressor_layer=['block1', 'block2'], + extractor_grad=False) + return model + + +if __name__ == '__main__': + import numpy as np + + a = np.random.uniform(-1, 1, [1, 3, 144, 144]).astype(np.float32) + b = np.random.uniform(-1, 1, [1, 3, 144, 144]).astype(np.float32) + bbox = [[3, 4, 10, 11]] + proposal_bbox = [[4, 5, 11, 12] * 16] + bbox = np.reshape(np.array(bbox), [1, 1, 4]).astype(np.float32) + proposal_bbox = np.reshape(np.array(proposal_bbox), + [1, 16, 4]).astype(np.float32) + with fluid.dygraph.guard(): + a_pd = fluid.dygraph.to_variable(a) + b_pd = fluid.dygraph.to_variable(b) + bbox_pd = fluid.dygraph.to_variable(bbox) + proposal_bbox_pd = fluid.dygraph.to_variable(proposal_bbox) + + model = atom_resnet50() + + res = model(a_pd, b_pd, bbox_pd, proposal_bbox_pd) + params = model.state_dict() + for v in params: + print(v) diff --git a/PaddleCV/tracking/ltr/models/bbreg/atom_iou_net.py b/PaddleCV/tracking/ltr/models/bbreg/atom_iou_net.py new file mode 100644 index 0000000000000000000000000000000000000000..7fcb2f0f4f99692c10e8c0a2c2665cc0e6dffacb --- /dev/null +++ b/PaddleCV/tracking/ltr/models/bbreg/atom_iou_net.py @@ -0,0 +1,350 @@ +""" +the implementation of ATOM iou net +""" +import paddle +import paddle.fluid as fluid +import paddle.fluid.dygraph.nn as nn +import numpy as np +import os.path as osp +import sys + +CURRENT_DIR = osp.dirname(__file__) +sys.path.append(osp.join(CURRENT_DIR, '..', '..', '..')) + + +def weight_init(): + init = fluid.initializer.MSRAInitializer(uniform=False) + param = fluid.ParamAttr(initializer=init) + return param + + +def bias_init(): + init = fluid.initializer.ConstantInitializer(value=0.) 
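One detail of `ATOMnet.forward` above that is easy to miss: train/test images arrive as 5-D tensors `(num_images, num_sequences, C, H, W)`, are flattened to a plain 4-D batch for the backbone, and the resulting features are reshaped back to the sequence-first layout that the IoU regressor expects. A shape-only NumPy sketch of that round trip (all dimensions below are invented):

```python
import numpy as np

num_images, num_sequences = 3, 2            # e.g. 3 frames per sequence, batch of 2
imgs = np.zeros((num_images, num_sequences, 3, 288, 288), dtype=np.float32)

# Flatten to a plain 4-D batch so the backbone sees (N, C, H, W).
flat = imgs.reshape(-1, *imgs.shape[-3:])
print(flat.shape)                            # (6, 3, 288, 288)

# Pretend the backbone returned one feature map per flattened image ...
feat = np.zeros((flat.shape[0], 256, 36, 36), dtype=np.float32)

# ... and restore the sequence-first layout used by bb_regressor.
feat_seq = feat.reshape(num_images, num_sequences, *feat.shape[-3:])
print(feat_seq.shape)                        # (3, 2, 256, 36, 36)
```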
+ param = fluid.ParamAttr(initializer=init) + return param + + +def norm_weight_init(): + # init = fluid.initializer.ConstantInitializer(1.0) + init = fluid.initializer.Uniform(low=0., high=1.) + param = fluid.ParamAttr(initializer=init) + return param + + +def norm_bias_init(): + init = fluid.initializer.ConstantInitializer(value=0.) + param = fluid.ParamAttr(initializer=init) + return param + + +class ConvBNReluLayer(fluid.dygraph.Layer): + def __init__(self, + in_channels, + out_channels, + filter_size, + stride=1, + groups=1, + padding=1, + is_test=False): + super(ConvBNReluLayer, self).__init__() + + self.conv = nn.Conv2D( + num_channels=in_channels, + filter_size=filter_size, + num_filters=out_channels, + stride=stride, + padding=padding, + groups=groups, + bias_attr=bias_init(), + param_attr=weight_init()) + self.bn = nn.BatchNorm( + out_channels, + param_attr=norm_weight_init(), + bias_attr=norm_bias_init(), + act=None, + momentum=0.9, + use_global_stats=is_test) + + def forward(self, inputs): + res = self.conv(inputs) + self.conv_res = res + res = self.bn(res) + res = fluid.layers.relu(res) + return res + + +class FCBNReluLayer(fluid.dygraph.Layer): + def __init__(self, + in_channels, + out_channels, + in_size, + is_bias=True, + is_bn=True, + is_relu=True, + is_test=False): + super(FCBNReluLayer, self).__init__() + self.is_bn = is_bn + self.is_relu = is_relu + + if is_bias: + bias_init = fluid.ParamAttr( + initializer=fluid.initializer.ConstantInitializer(0.)) + else: + bias_init = False + self.linear = nn.Linear( + in_channels * in_size * in_size, out_channels, bias_attr=bias_init) + self.bn = nn.BatchNorm( + out_channels, + param_attr=norm_weight_init(), + bias_attr=norm_bias_init(), + act=None, + momentum=0.9, + use_global_stats=is_test) + + def forward(self, x): + x = fluid.layers.reshape(x, [x.shape[0], -1]) + + x = self.linear(x) + if self.is_bn: + x = self.bn(x) + if self.is_relu: + x = fluid.layers.relu(x) + return x + + +class AtomIouNet(fluid.dygraph.Layer): + def __init__(self, + name, + input_dim=(128, 256), + pred_input_dim=(256, 256), + pred_inter_dim=(256, 256), + is_test=False): + super(AtomIouNet, self).__init__(name) + self.name = self.full_name() + self.conv3_1r = ConvBNReluLayer( + input_dim[0], 128, filter_size=3, stride=1, is_test=is_test) + self.conv3_1t = ConvBNReluLayer( + input_dim[0], 256, filter_size=3, stride=1, is_test=is_test) + + self.conv3_2t = ConvBNReluLayer( + 256, pred_input_dim[0], filter_size=3, stride=1, is_test=is_test) + + self.fc3_1r = ConvBNReluLayer( + 128, 256, filter_size=3, stride=1, padding=0, is_test=is_test) + + self.conv4_1r = ConvBNReluLayer( + input_dim[1], 256, filter_size=3, stride=1, is_test=is_test) + self.conv4_1t = ConvBNReluLayer( + input_dim[1], 256, filter_size=3, stride=1, is_test=is_test) + + self.conv4_2t = ConvBNReluLayer( + 256, pred_input_dim[1], filter_size=3, stride=1, is_test=is_test) + + self.fc34_3r = ConvBNReluLayer( + 512, + pred_input_dim[0], + filter_size=1, + stride=1, + padding=0, + is_test=is_test) + self.fc34_4r = ConvBNReluLayer( + 512, + pred_input_dim[1], + filter_size=1, + stride=1, + padding=0, + is_test=is_test) + + self.fc3_rt = FCBNReluLayer( + pred_input_dim[0], pred_inter_dim[0], in_size=5, is_test=is_test) + self.fc4_rt = FCBNReluLayer( + pred_input_dim[1], pred_inter_dim[1], in_size=3, is_test=is_test) + + bias_init = fluid.initializer.ConstantInitializer(0.) 
+ self.iou_predictor = nn.Linear( + pred_inter_dim[0] + pred_inter_dim[1], 1, bias_attr=bias_init) + + self.outs = {} + + def predict_iou(self, filter, feat2, proposals): + """ + predicts IOU for the given proposals + :param modulation: Modulation vectors for the targets. Dims (batch, feature_dim). + :param feat: IoU features (from get_iou_feat) for test images. Dims (batch, feature_dim, H, W). + :param proposals: Proposal boxes for which the IoU will be predicted (batch, num_proposals, 4). + :return: + """ + fc34_3_r, fc34_4_r = filter + c3_t, c4_t = feat2 + + batch_size = c3_t.shape[0] + + # Modulation + c3_t_att = c3_t * fluid.layers.reshape(fc34_3_r, [batch_size, -1, 1, 1]) + c4_t_att = c4_t * fluid.layers.reshape(fc34_4_r, [batch_size, -1, 1, 1]) + + # add batch roi nums + num_proposals_per_batch = proposals.shape[1] + batch_roi_nums = np.array([num_proposals_per_batch] * + batch_size).astype(np.int64) + batch_roi_nums = fluid.dygraph.to_variable(batch_roi_nums) + + # input proposals2 is in format xywh, convert it to x0y0x1y1 format + proposals_xyxy = fluid.layers.concat( + [ + proposals[:, :, 0:2], + proposals[:, :, 0:2] + proposals[:, :, 2:4] + ], + axis=2) + + roi2 = fluid.layers.reshape(proposals_xyxy, [-1, 4]) + roi2.stop_gradient = False + + roi3t = fluid.layers.prroi_pool( + c3_t_att, roi2, 1 / 8., 5, 5, batch_roi_nums=batch_roi_nums) + roi4t = fluid.layers.prroi_pool( + c4_t_att, roi2, 1 / 16., 3, 3, batch_roi_nums=batch_roi_nums) + + fc3_rt = self.fc3_rt(roi3t) + fc4_rt = self.fc4_rt(roi4t) + + fc34_rt_cat = fluid.layers.concat([fc3_rt, fc4_rt], axis=1) + + iou_pred = self.iou_predictor(fc34_rt_cat) + iou_pred = fluid.layers.reshape(iou_pred, + [batch_size, num_proposals_per_batch]) + + return iou_pred + + def forward(self, feat1, feat2, bb1, proposals2): + """Runs the ATOM IoUNet during training operation. + This forward pass is mainly used for training. Call the individual functions during tracking instead. + args: + feat1: Variable, Features from the reference frames (4 or 5 dims). + feat2: Variable, Features from the test frames (4 or 5 dims). + bb1: Target boxes (x,y,x2,y2) in image coords in the reference samples. Dims (images, sequences, 4). + proposals2: Proposal boxes for which the IoU will be predicted (images, sequences, num_proposals, 4).""" + assert len(feat1[0].shape) == 5, 'Expect 5 dimensional feat1' + num_test_images = feat2[0].shape[0] + batch_size = feat2[0].shape[1] + + # Extract first train sample + feat1 = [f[0] for f in feat1] + bb1 = bb1[0] + + # Get modulation vector + modulation = self.get_filter(feat1, bb1) + + feat2 = [ + fluid.layers.reshape(f, + (batch_size * num_test_images, *f.shape[-3:])) + for f in feat2 + ] + iou_feat = self.get_iou_feat(feat2) + + new_modulation = [] + for i in range(0, len(modulation)): + tmp = modulation[i] + tmp = fluid.layers.reshape(tmp, [1, batch_size, -1]) + tmp = fluid.layers.expand(tmp, [num_test_images, 1, 1]) + tmp = fluid.layers.reshape(tmp, [batch_size * num_test_images, -1]) + new_modulation.append(tmp) + + proposals2 = fluid.layers.reshape( + proposals2, [batch_size * num_test_images, -1, 4]) + + pred_iou = self.predict_iou(new_modulation, iou_feat, proposals2) + pred_iou = fluid.layers.reshape(pred_iou, + [num_test_images, batch_size, -1]) + return pred_iou + + def get_filter(self, feat1, bb1): + """ + get modulation feature [feature1, feature2] for the targets + :param feat1: variable, Backbone features from reference images. shapes (batch, feature_dim, H, W). 
+ :param bb1: variable, Target boxes (x,y,w,h) in image coords in the reference samples. shapes (batch, 4). + :return: + """ + feat3_r, feat4_r = feat1 + + c3_r = self.conv3_1r(feat3_r) + + # Add batch_index to rois + batch_size = bb1.shape[0] + batch_roi_nums = np.array([1] * batch_size).astype(np.int64) + batch_roi_nums = fluid.dygraph.to_variable(batch_roi_nums) + + # input bb is in format xywh, convert it to x0y0x1y1 format + roi1 = fluid.layers.concat( + [bb1[:, 0:2], bb1[:, 0:2] + bb1[:, 2:4]], axis=1) + roi1.stop_gradient = False + + roi3r = fluid.layers.prroi_pool(c3_r, roi1, 1 / 8., 3, 3, + batch_roi_nums) + + c4_r = self.conv4_1r(feat4_r) + roi4r = fluid.layers.prroi_pool(c4_r, roi1, 1 / 16., 1, 1, + batch_roi_nums) + + fc3_r = self.fc3_1r(roi3r) + + # Concatenate + fc34_r = fluid.layers.concat([fc3_r, roi4r], axis=1) + + fc34_3_r = self.fc34_3r(fc34_r) + fc34_4_r = self.fc34_4r(fc34_r) + + return fc34_3_r, fc34_4_r + + def get_iou_feat(self, feat2): + """ + Get IoU prediction features from a 4 or 5 dimensional backbone input. + :param feat2: variable, Backbone features from reference images. [feature1, feature2] + :return: features, variable + """ + feat3_t, feat4_t = feat2 + c3_t = self.conv3_2t(self.conv3_1t(feat3_t)) + c4_t = self.conv4_2t(self.conv4_1t(feat4_t)) + + return c3_t, c4_t + + +def atom_iounet(name, + input_dim=(128, 256), + pred_input_dim=(256, 256), + pred_inter_dim=(256, 256)): + return AtomIouNet( + name, + input_dim=input_dim, + pred_input_dim=pred_input_dim, + pred_inter_dim=pred_inter_dim) + + +def test_paddle_iounet(): + a = np.random.uniform(-1, 1, [1, 1, 512, 18, 18]).astype(np.float32) + b = np.random.uniform(-1, 1, [1, 1, 1024, 9, 9]).astype(np.float32) + bbox = [[3, 4, 10, 11]] + proposal_bbox = [[4, 5, 11, 12] * 16] + bbox = np.reshape(np.array(bbox), [1, 1, 4]).astype(np.float32) + proposal_bbox = np.reshape(np.array(proposal_bbox), + [1, 16, 4]).astype(np.float32) + with fluid.dygraph.guard(): + a_pd = fluid.dygraph.to_variable(a) + b_pd = fluid.dygraph.to_variable(b) + bbox_pd = fluid.dygraph.to_variable(bbox) + proposal_bbox_pd = fluid.dygraph.to_variable(proposal_bbox) + feat1 = [a_pd, b_pd] + feat2 = [a_pd, b_pd] + + model = AtomIouNet('IOUNet', input_dim=(512, 1024)) + res = model(feat1, feat2, bbox_pd, proposal_bbox_pd) + print(res.shape) + params = model.state_dict() + + for v in params: + print(v, '\t', params[v].shape) + print(len(params)) + + +if __name__ == '__main__': + test_paddle_iounet() diff --git a/PaddleCV/tracking/ltr/models/siamese/__init__.py b/PaddleCV/tracking/ltr/models/siamese/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8adbdfc323c7a6031904b5acb269bb059a9fc92c --- /dev/null +++ b/PaddleCV/tracking/ltr/models/siamese/__init__.py @@ -0,0 +1 @@ +from .target_estimator_net import SiamFCEstimator diff --git a/PaddleCV/tracking/ltr/models/siamese/siam.py b/PaddleCV/tracking/ltr/models/siamese/siam.py new file mode 100644 index 0000000000000000000000000000000000000000..058015fcc1d50c23b8708af6bb8381b0d8d642a8 --- /dev/null +++ b/PaddleCV/tracking/ltr/models/siamese/siam.py @@ -0,0 +1,64 @@ +from paddle import fluid +from paddle.fluid import dygraph +import ltr.models.siamese.target_estimator_net as tgt_estimator + + +class SiamNet(dygraph.layers.Layer): + def __init__(self, + name, + feature_extractor, + target_estimator, + target_estimator_layer, + extractor_grad=True): + """ + + :param feature_extractor: backbone + :param target_estimator: headers + :param target_estimator_layer: list, which 
layer is used in header, + :param extractor_grad: default is True + """ + super(SiamNet, self).__init__(name) + + self.feature_extractor = feature_extractor + self.target_estimator = target_estimator + self.target_estimator_layer = target_estimator_layer + + def forward(self, train_imgs, test_imgs): + # extract backbone features + if len(train_imgs.shape) == 5: + train_imgs = fluid.layers.reshape( + train_imgs, [-1, *list(train_imgs.shape)[-3:]]) + test_imgs = fluid.layers.reshape(test_imgs, + [-1, *list(test_imgs.shape)[-3:]]) + + train_feat = self.extract_backbone_features(train_imgs) + test_feat = self.extract_backbone_features(test_imgs) + + train_feat = [feat for feat in train_feat.values()] + test_feat = [feat for feat in test_feat.values()] + + # Obtain target estimation + targets = self.target_estimator(train_feat, test_feat) + return targets + + def extract_backbone_features(self, im, layers=None): + if layers is None: + layers = self.target_estimator_layer + return self.feature_extractor(im, layers) + + def extract_features(self, im, layers): + return self.feature_extractor(im, layers) + + +def siamfc_alexnet(backbone_pretrained=False, + backbone_is_test=False, + estimator_is_test=False): + from ltr.models.backbone.sfc_alexnet import SFC_AlexNet + backbone_net = SFC_AlexNet('AlexNet', is_test=backbone_is_test) + target_estimator = tgt_estimator.SiamFCEstimator('CenterEstimator') + model = SiamNet( + 'SiamFC', + backbone_net, + target_estimator, + ['conv5'], ) + return model diff --git a/PaddleCV/tracking/ltr/models/siamese/target_estimator_net.py b/PaddleCV/tracking/ltr/models/siamese/target_estimator_net.py new file mode 100644 index 0000000000000000000000000000000000000000..e4a676dd4594e4c17fdbf2b16eb92cb40ce451af --- /dev/null +++ b/PaddleCV/tracking/ltr/models/siamese/target_estimator_net.py @@ -0,0 +1,47 @@ +from paddle import fluid +from paddle.fluid import dygraph +from paddle.fluid.dygraph import nn + +from pytracking.libs.Fconv2d import Conv2D + + +class SiamFCEstimator(dygraph.layers.Layer): + def __init__(self, name): + super().__init__(name) + init_w = fluid.ParamAttr( + name="a_weight", + initializer=fluid.initializer.ConstantInitializer(0.001), + learning_rate=0., + trainable=False) + init_b = fluid.ParamAttr( + name="a_bias", + initializer=fluid.initializer.ConstantInitializer(0.), + trainable=True) + + self.adjust_conv = nn.Conv2D( + 1, 1, 1, 1, 0, param_attr=init_w, bias_attr=init_b) + + def forward(self, exemplar, instance): + exemplar_f = self.get_reference(exemplar) + instance_f = self.get_search_feat(instance) + score_map = self.estimate(exemplar_f, instance_f) + return score_map + + def get_reference(self, feat): + # remove list warp + return feat[0] + + def get_search_feat(self, feat): + # remove list warp + return feat[0] + + def estimate(self, exemplar, instance): + shape = instance.shape + instance = fluid.layers.reshape( + instance, shape=[1, -1, shape[2], shape[3]]) + + cross_conv = Conv2D(stride=1, padding=0, dilation=1, groups=shape[0]) + score_map = cross_conv(instance, exemplar) + score_map = fluid.layers.transpose(score_map, [1, 0, 2, 3]) + score_map = self.adjust_conv(score_map) + return score_map diff --git a/PaddleCV/tracking/ltr/run_training.py b/PaddleCV/tracking/ltr/run_training.py new file mode 100644 index 0000000000000000000000000000000000000000..e18fc03ff2738020036f642178948aba87dade0d --- /dev/null +++ b/PaddleCV/tracking/ltr/run_training.py @@ -0,0 +1,60 @@ +import os +import sys +import argparse +import importlib +import 
multiprocessing +import paddle +import cv2 as cv + +env_path = os.path.join(os.path.dirname(__file__), '..') +if env_path not in sys.path: + sys.path.append(env_path) + +import ltr.admin.settings as ws_settings + + +def run_training(train_module, train_name): + """Run a train scripts in train_settings. + args: + train_module: Name of module in the "train_settings/" folder. + train_name: Name of the train settings file. + """ + # set single threads in opencv + cv.setNumThreads(0) + + print('Training: {} {}'.format(train_module, train_name)) + + settings = ws_settings.Settings() + + if settings.env.workspace_dir == '': + raise Exception('Setup your workspace_dir in "ltr/admin/local.py".') + + settings.module_name = train_module + settings.script_name = train_name + settings.project_path = 'ltr/{}/{}'.format(train_module, train_name) + + expr_module = importlib.import_module('ltr.train_settings.{}.{}'.format( + train_module, train_name)) + expr_func = getattr(expr_module, 'run') + + expr_func(settings) + + +def main(): + parser = argparse.ArgumentParser( + description='Run a train scripts in train_settings.') + parser.add_argument( + 'train_module', + type=str, + help='Name of module in the "train_settings/" folder.') + parser.add_argument( + 'train_name', type=str, help='Name of the train settings file.') + + args = parser.parse_args() + + run_training(args.train_module, args.train_name) + + +if __name__ == '__main__': + multiprocessing.set_start_method('spawn', force=True) + main() diff --git a/PaddleCV/tracking/ltr/train_settings/bbreg/atom_res18_vid_lasot_coco.py b/PaddleCV/tracking/ltr/train_settings/bbreg/atom_res18_vid_lasot_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..688d9c81ad34a947758deea3e0e173344704ef67 --- /dev/null +++ b/PaddleCV/tracking/ltr/train_settings/bbreg/atom_res18_vid_lasot_coco.py @@ -0,0 +1,148 @@ +import paddle.fluid as fluid +import paddle.fluid.dygraph as dygraph + +import ltr.actors as actors +import ltr.data.transforms as dltransforms +from ltr.data import processing, sampler, loader +from ltr.dataset import ImagenetVID, MSCOCOSeq, Lasot, Got10k +from ltr.models.bbreg.atom import atom_resnet50, atom_resnet18 +from ltr.trainers import LTRTrainer + + +def run(settings): + # Most common settings are assigned in the settings struct + settings.description = 'ATOM IoUNet with ResNet18 backbone and trained with vid, lasot, coco.' 
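The `proposal_params` used further below (`min_iou`, `boxes_per_frame`, `sigma_factor`) steer how jittered proposals are drawn around the ground truth for IoU regression. The processing module that does this is not part of this diff, so the following is only a hedged illustration of the idea, with an invented ground-truth box and a hand-written `iou_xywh` helper: perturb an `(x, y, w, h)` box with Gaussian noise proportional to its size and keep candidates whose IoU with the ground truth exceeds `min_iou`.

```python
import numpy as np

def iou_xywh(a, b):
    """IoU of two (x, y, w, h) boxes."""
    ax2, ay2 = a[0] + a[2], a[1] + a[3]
    bx2, by2 = b[0] + b[2], b[1] + b[3]
    iw = max(0.0, min(ax2, bx2) - max(a[0], b[0]))
    ih = max(0.0, min(ay2, by2) - max(a[1], b[1]))
    inter = iw * ih
    return inter / (a[2] * a[3] + b[2] * b[3] - inter)

rng = np.random.RandomState(0)
gt = np.array([50., 40., 120., 80.])         # invented ground-truth box
min_iou, sigma_factor = 0.1, 0.2             # one entry of the sigma_factor list below

proposals = []
while len(proposals) < 16:                   # boxes_per_frame
    # jitter position and size with noise proportional to the box size
    noise = rng.randn(4) * sigma_factor * np.array([gt[2], gt[3], gt[2], gt[3]])
    cand = gt + noise
    if cand[2] > 1 and cand[3] > 1 and iou_xywh(gt, cand) > min_iou:
        proposals.append(cand)
print(len(proposals), iou_xywh(gt, proposals[0]))
```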
+ settings.print_interval = 1 # How often to print loss and other info + settings.batch_size = 64 # Batch size + settings.num_workers = 4 # Number of workers for image loading + settings.normalize_mean = [0.485, 0.456, 0.406 + ] # Normalize mean (default ImageNet values) + settings.normalize_std = [0.229, 0.224, + 0.225] # Normalize std (default ImageNet values) + settings.search_area_factor = 5.0 # Image patch size relative to target size + settings.feature_sz = 18 # Size of feature map + settings.output_sz = settings.feature_sz * 16 # Size of input image patches + + # Settings for the image sample and proposal generation + settings.center_jitter_factor = {'train': 0, 'test': 4.5} + settings.scale_jitter_factor = {'train': 0, 'test': 0.5} + settings.proposal_params = { + 'min_iou': 0.1, + 'boxes_per_frame': 16, + 'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3] + } + + # Train datasets + vid_train = ImagenetVID() + lasot_train = Lasot(split='train') + coco_train = MSCOCOSeq() + + # Validation datasets + got10k_val = Got10k(split='val') + + # The joint augmentation transform, that is applied to the pairs jointly + transform_joint = dltransforms.ToGrayscale(probability=0.05) + + # The augmentation transform applied to the training set (individually to each image in the pair) + transform_train = dltransforms.Compose([ + dltransforms.ToArrayAndJitter(0.2), dltransforms.Normalize( + mean=settings.normalize_mean, std=settings.normalize_std) + ]) + + # The augmentation transform applied to the validation set (individually to each image in the pair) + transform_val = dltransforms.Compose([ + dltransforms.ToArray(), dltransforms.Normalize( + mean=settings.normalize_mean, std=settings.normalize_std) + ]) + + # Data processing to do on the training pairs + data_processing_train = processing.ATOMProcessing( + search_area_factor=settings.search_area_factor, + output_sz=settings.output_sz, + center_jitter_factor=settings.center_jitter_factor, + scale_jitter_factor=settings.scale_jitter_factor, + mode='sequence', + proposal_params=settings.proposal_params, + transform=transform_train, + joint_transform=transform_joint) + + # Data processing to do on the validation pairs + data_processing_val = processing.ATOMProcessing( + search_area_factor=settings.search_area_factor, + output_sz=settings.output_sz, + center_jitter_factor=settings.center_jitter_factor, + scale_jitter_factor=settings.scale_jitter_factor, + mode='sequence', + proposal_params=settings.proposal_params, + transform=transform_val, + joint_transform=transform_joint) + + # The sampler for training + dataset_train = sampler.ATOMSampler( + [vid_train, lasot_train, coco_train], [1, 1, 1], + samples_per_epoch=1000 * settings.batch_size, + max_gap=50, + processing=data_processing_train) + + # The loader for training + train_loader = loader.LTRLoader( + 'train', + dataset_train, + training=True, + batch_size=settings.batch_size, + num_workers=4, + stack_dim=1) + + # The sampler for validation + dataset_val = sampler.ATOMSampler( + [got10k_val], [1, ], + samples_per_epoch=500 * settings.batch_size, + max_gap=50, + processing=data_processing_val) + + # The loader for validation + val_loader = loader.LTRLoader( + 'val', + dataset_val, + training=False, + batch_size=settings.batch_size, + epoch_interval=5, + num_workers=4, + stack_dim=1) + + # creat network, set objective, creat optimizer, learning rate scheduler, trainer + with dygraph.guard(): + # Create network + net = atom_resnet18(backbone_pretrained=True) + + # Freeze backbone + state_dicts = 
net.state_dict() + for k in state_dicts.keys(): + if 'feature_extractor' in k and "running" not in k: + state_dicts[k].stop_gradient = True + + # Set objective + objective = fluid.layers.square_error_cost + + # Create actor, which wraps network and objective + actor = actors.AtomActor(net=net, objective=objective) + + # Set to training mode + actor.train() + + # define optimizer and learning rate + gama = 0.2 + lr = 1e-3 + lr_scheduler = fluid.dygraph.PiecewiseDecay( + [15, 30, 45], + values=[lr, lr * gama, lr * gama * gama], + step=1000, + begin=0) + + optimizer = fluid.optimizer.Adam( + parameter_list=net.bb_regressor.parameters(), + learning_rate=lr_scheduler) + + trainer = LTRTrainer(actor, [train_loader, val_loader], optimizer, + settings, lr_scheduler) + trainer.train(40, load_latest=False, fail_safe=False) diff --git a/PaddleCV/tracking/ltr/train_settings/bbreg/atom_res50_vid_lasot_coco.py b/PaddleCV/tracking/ltr/train_settings/bbreg/atom_res50_vid_lasot_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..6b4f09bcf361ca539e4f0f3a2754300918df8c33 --- /dev/null +++ b/PaddleCV/tracking/ltr/train_settings/bbreg/atom_res50_vid_lasot_coco.py @@ -0,0 +1,148 @@ +import paddle.fluid as fluid +import paddle.fluid.dygraph as dygraph + +import ltr.actors as actors +import ltr.data.transforms as dltransforms +from ltr.data import processing, sampler, loader +from ltr.dataset import ImagenetVID, MSCOCOSeq, Lasot, Got10k +from ltr.models.bbreg.atom import atom_resnet50, atom_resnet18 +from ltr.trainers import LTRTrainer + + +def run(settings): + # Most common settings are assigned in the settings struct + settings.description = 'ATOM IoUNet with ResNet50 backbone and trained with vid, lasot, coco.' + settings.print_interval = 1 # How often to print loss and other info + settings.batch_size = 64 # Batch size + settings.num_workers = 4 # Number of workers for image loading + settings.normalize_mean = [0.485, 0.456, 0.406 + ] # Normalize mean (default ImageNet values) + settings.normalize_std = [0.229, 0.224, + 0.225] # Normalize std (default ImageNet values) + settings.search_area_factor = 5.0 # Image patch size relative to target size + settings.feature_sz = 18 # Size of feature map + settings.output_sz = settings.feature_sz * 16 # Size of input image patches + + # Settings for the image sample and proposal generation + settings.center_jitter_factor = {'train': 0, 'test': 4.5} + settings.scale_jitter_factor = {'train': 0, 'test': 0.5} + settings.proposal_params = { + 'min_iou': 0.1, + 'boxes_per_frame': 16, + 'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3] + } + + # Train datasets + vid_train = ImagenetVID() + lasot_train = Lasot(split='train') + coco_train = MSCOCOSeq() + + # Validation datasets + got10k_val = Got10k(split='val') + + # The joint augmentation transform, that is applied to the pairs jointly + transform_joint = dltransforms.ToGrayscale(probability=0.05) + + # The augmentation transform applied to the training set (individually to each image in the pair) + transform_train = dltransforms.Compose([ + dltransforms.ToArrayAndJitter(0.2), dltransforms.Normalize( + mean=settings.normalize_mean, std=settings.normalize_std) + ]) + + # The augmentation transform applied to the validation set (individually to each image in the pair) + transform_val = dltransforms.Compose([ + dltransforms.ToArray(), dltransforms.Normalize( + mean=settings.normalize_mean, std=settings.normalize_std) + ]) + + # Data processing to do on the training pairs + data_processing_train = 
processing.ATOMProcessing( + search_area_factor=settings.search_area_factor, + output_sz=settings.output_sz, + center_jitter_factor=settings.center_jitter_factor, + scale_jitter_factor=settings.scale_jitter_factor, + mode='sequence', + proposal_params=settings.proposal_params, + transform=transform_train, + joint_transform=transform_joint) + + # Data processing to do on the validation pairs + data_processing_val = processing.ATOMProcessing( + search_area_factor=settings.search_area_factor, + output_sz=settings.output_sz, + center_jitter_factor=settings.center_jitter_factor, + scale_jitter_factor=settings.scale_jitter_factor, + mode='sequence', + proposal_params=settings.proposal_params, + transform=transform_val, + joint_transform=transform_joint) + + # The sampler for training + dataset_train = sampler.ATOMSampler( + [vid_train, lasot_train, coco_train], [1, 1, 1], + samples_per_epoch=1000 * settings.batch_size, + max_gap=50, + processing=data_processing_train) + + # The loader for training + train_loader = loader.LTRLoader( + 'train', + dataset_train, + training=True, + batch_size=settings.batch_size, + num_workers=4, + stack_dim=1) + + # The sampler for validation + dataset_val = sampler.ATOMSampler( + [got10k_val], [1, ], + samples_per_epoch=500 * settings.batch_size, + max_gap=50, + processing=data_processing_val) + + # The loader for validation + val_loader = loader.LTRLoader( + 'val', + dataset_val, + training=False, + batch_size=settings.batch_size, + num_workers=4, + epoch_interval=5, + stack_dim=1) + + # creat network, set objective, creat optimizer, learning rate scheduler, trainer + with dygraph.guard(): + # Create network + net = atom_resnet50(backbone_pretrained=True) + + # Freeze backbone + state_dicts = net.state_dict() + for k in state_dicts.keys(): + if 'feature_extractor' in k and "running" not in k: + state_dicts[k].stop_gradient = True + + # Set objective + objective = fluid.layers.square_error_cost + + # Create actor, which wraps network and objective + actor = actors.AtomActor(net=net, objective=objective) + + # Set to training mode + actor.train() + + # define optimizer and learning rate + gama = 0.2 + lr = 1e-3 + lr_scheduler = fluid.dygraph.PiecewiseDecay( + [15, 30, 45], + values=[lr, lr * gama, lr * gama * gama], + step=1000, + begin=0) + + optimizer = fluid.optimizer.Adam( + parameter_list=net.bb_regressor.parameters(), + learning_rate=lr_scheduler) + + trainer = LTRTrainer(actor, [train_loader, val_loader], optimizer, + settings, lr_scheduler) + trainer.train(40, load_latest=False, fail_safe=False) diff --git a/PaddleCV/tracking/ltr/train_settings/siamfc/siamfc_alexnet_vid.py b/PaddleCV/tracking/ltr/train_settings/siamfc/siamfc_alexnet_vid.py new file mode 100644 index 0000000000000000000000000000000000000000..03c5826ec7a351fb7097964e45afd63476fc51d2 --- /dev/null +++ b/PaddleCV/tracking/ltr/train_settings/siamfc/siamfc_alexnet_vid.py @@ -0,0 +1,181 @@ +import paddle.fluid as fluid +import paddle.fluid.dygraph as dygraph + +import ltr.actors as actors +import ltr.data.transforms as dltransforms +from ltr.data import processing, sampler, loader +from ltr.dataset import ImagenetVID, Got10k +from ltr.models.siamese.siam import siamfc_alexnet +from ltr.trainers import LTRTrainer +import numpy as np +import cv2 as cv +from PIL import Image, ImageEnhance + + +class DataAug(dltransforms.Transform): + def __init__(self): + pass + + def random_blur(self, img): + k = np.random.choice([3, 5, 7]) + return cv.GaussianBlur(img, (k, k), sigmaX=0, sigmaY=0) + + def 
brightness(self, img): + img = Image.fromarray(img.astype('uint8')) + enh_bri = ImageEnhance.Brightness(img) + brightness = np.random.choice(np.linspace(0.5, 1.25, 4)) + img_brighted = enh_bri.enhance(brightness) + + return np.array(img_brighted) + + def contrast(self, img): + img = Image.fromarray(img.astype('uint8')) + enh_con = ImageEnhance.Contrast(img) + contrast = np.random.choice(np.linspace(0.5, 1.25, 4)) + image_contrasted = enh_con.enhance(contrast) + + return np.array(image_contrasted) + + def no_aug(self, img): + return img + + def flip(self, img): + return cv.flip(img, 1) + + def transform(self, img, *args): + func = np.random.choice( + [self.contrast, self.random_blur, self.brightness, self.flip]) + return func(img) + + +def run(settings): + # Most common settings are assigned in the settings struct + settings.description = 'SiamFC with Alexnet backbone and trained with vid' + settings.print_interval = 100 # How often to print loss and other info + settings.batch_size = 8 # Batch size + settings.num_workers = 8 # Number of workers for image loading + settings.normalize_mean = [0., 0., 0.] # Normalize mean + settings.normalize_std = [1 / 255., 1 / 255., 1 / 255.] # Normalize std + settings.search_area_factor = { + 'train': 1.0, + 'test': 2.0078740157480315 + } # roughly the same as SiamFC + settings.output_sz = {'train': 127, 'test': 255} + settings.scale_type = 'context' + settings.border_type = 'meanpad' + + # Settings for the image sample and proposal generation + settings.center_jitter_factor = {'train': 0, 'test': 0} + settings.scale_jitter_factor = {'train': 0, 'test': 0.} + + # Train datasets + vid_train = ImagenetVID() + + # Validation datasets + got10k_val = vid_train #Got10k(split='val') + + # The joint augmentation transform, that is applied to the pairs jointly + transform_joint = dltransforms.ToGrayscale(probability=0.25) + + # The augmentation transform applied to the training set (individually to each image in the pair) + transform_exemplar = dltransforms.Compose([ + dltransforms.ToArray(), dltransforms.Normalize( + mean=settings.normalize_mean, std=settings.normalize_std) + ]) + transform_instance = dltransforms.Compose([ + DataAug(), dltransforms.ToArray(), dltransforms.Normalize( + mean=settings.normalize_mean, std=settings.normalize_std) + ]) + + # Data processing to do on the training pairs + data_processing_train = processing.SiamFCProcessing( + search_area_factor=settings.search_area_factor, + output_sz=settings.output_sz, + center_jitter_factor=settings.center_jitter_factor, + scale_jitter_factor=settings.scale_jitter_factor, + scale_type=settings.scale_type, + border_type=settings.border_type, + mode='sequence', + train_transform=transform_exemplar, + test_transform=transform_instance, + joint_transform=transform_joint) + + # Data processing to do on the validation pairs + data_processing_val = processing.SiamFCProcessing( + search_area_factor=settings.search_area_factor, + output_sz=settings.output_sz, + center_jitter_factor=settings.center_jitter_factor, + scale_jitter_factor=settings.scale_jitter_factor, + scale_type=settings.scale_type, + border_type=settings.border_type, + mode='sequence', + transform=transform_exemplar, + joint_transform=transform_joint) + + # The sampler for training + dataset_train = sampler.ATOMSampler( + [vid_train], [1, ], + samples_per_epoch=6650 * settings.batch_size, + max_gap=100, + processing=data_processing_train) + + # The loader for training + train_loader = loader.LTRLoader( + 'train', + dataset_train, + 
training=True, + batch_size=settings.batch_size, + num_workers=settings.num_workers, + stack_dim=1) + + # The sampler for validation + dataset_val = sampler.ATOMSampler( + [got10k_val], [1, ], + samples_per_epoch=1000 * settings.batch_size, + max_gap=100, + processing=data_processing_val) + + # The loader for validation + val_loader = loader.LTRLoader( + 'val', + dataset_val, + training=False, + batch_size=settings.batch_size, + num_workers=settings.num_workers, + epoch_interval=5, + stack_dim=1) + + # creat network, set objective, creat optimizer, learning rate scheduler, trainer + with dygraph.guard(): + # Create network + net = siamfc_alexnet() + + # Create actor, which wraps network and objective + actor = actors.SiamFCActor( + net=net, + objective=None, + batch_size=settings.batch_size, + shape=(17, 17), + radius=16, + stride=8) + + # Set to training mode + actor.train() + + # define optimizer and learning rate + lr_scheduler = fluid.layers.exponential_decay( + learning_rate=0.01, + decay_steps=6650, + decay_rate=0.8685, + staircase=True) + regularizer = fluid.regularizer.L2DecayRegularizer( + regularization_coeff=0.0005) + optimizer = fluid.optimizer.Momentum( + momentum=0.9, + regularization=regularizer, + parameter_list=net.parameters(), + learning_rate=lr_scheduler) + + trainer = LTRTrainer(actor, [train_loader], optimizer, settings, + lr_scheduler) + trainer.train(50, load_latest=False, fail_safe=False) diff --git a/PaddleCV/tracking/ltr/train_settings/siamfc/siamfc_alexnet_vid_replicate.py b/PaddleCV/tracking/ltr/train_settings/siamfc/siamfc_alexnet_vid_replicate.py new file mode 100644 index 0000000000000000000000000000000000000000..3ca4695a3232b55fcdb226bd34358903bbcabc32 --- /dev/null +++ b/PaddleCV/tracking/ltr/train_settings/siamfc/siamfc_alexnet_vid_replicate.py @@ -0,0 +1,181 @@ +import paddle.fluid as fluid +import paddle.fluid.dygraph as dygraph + +import ltr.actors as actors +import ltr.data.transforms as dltransforms +from ltr.data import processing, sampler, loader +from ltr.dataset import ImagenetVID, Got10k +from ltr.models.siamese.siam import siamfc_alexnet +from ltr.trainers import LTRTrainer +import numpy as np +import cv2 as cv +from PIL import Image, ImageEnhance + + +class DataAug(dltransforms.Transform): + def __init__(self): + pass + + def random_blur(self, img): + k = np.random.choice([3, 5, 7]) + return cv.GaussianBlur(img, (k, k), sigmaX=0, sigmaY=0) + + def brightness(self, img): + img = Image.fromarray(img.astype('uint8')) + enh_bri = ImageEnhance.Brightness(img) + brightness = np.random.choice(np.linspace(0.5, 1.25, 4)) + img_brighted = enh_bri.enhance(brightness) + + return np.array(img_brighted) + + def contrast(self, img): + img = Image.fromarray(img.astype('uint8')) + enh_con = ImageEnhance.Contrast(img) + contrast = np.random.choice(np.linspace(0.5, 1.25, 4)) + image_contrasted = enh_con.enhance(contrast) + + return np.array(image_contrasted) + + def no_aug(self, img): + return img + + def flip(self, img): + return cv.flip(img, 1) + + def transform(self, img, *args): + func = np.random.choice( + [self.contrast, self.random_blur, self.brightness, self.flip]) + return func(img) + + +def run(settings): + # Most common settings are assigned in the settings struct + settings.description = 'SiamFC with Alexnet backbone and trained with vid' + settings.print_interval = 1 # How often to print loss and other info + settings.batch_size = 8 # Batch size + settings.num_workers = 8 # Number of workers for image loading + settings.normalize_mean = [0., 0., 0.] 
# Normalize mean + settings.normalize_std = [1 / 255., 1 / 255., 1 / 255.] # Normalize std + settings.search_area_factor = { + 'train': 1.0, + 'test': 2.0078740157480315 + } # roughly the same as SiamFC + settings.output_sz = {'train': 127, 'test': 255} + settings.scale_type = 'context' + settings.border_type = 'replicate' + + # Settings for the image sample and proposal generation + settings.center_jitter_factor = {'train': 0, 'test': 0} + settings.scale_jitter_factor = {'train': 0, 'test': 0.} + + # Train datasets + vid_train = ImagenetVID() + + # Validation datasets + got10k_val = Got10k(split='val') + + # The joint augmentation transform, that is applied to the pairs jointly + transform_joint = dltransforms.ToGrayscale(probability=0.25) + + # The augmentation transform applied to the training set (individually to each image in the pair) + transform_exemplar = dltransforms.Compose([ + dltransforms.ToArray(), dltransforms.Normalize( + mean=settings.normalize_mean, std=settings.normalize_std) + ]) + transform_instance = dltransforms.Compose([ + DataAug(), dltransforms.ToArray(), dltransforms.Normalize( + mean=settings.normalize_mean, std=settings.normalize_std) + ]) + + # Data processing to do on the training pairs + data_processing_train = processing.SiamFCProcessing( + search_area_factor=settings.search_area_factor, + output_sz=settings.output_sz, + center_jitter_factor=settings.center_jitter_factor, + scale_jitter_factor=settings.scale_jitter_factor, + scale_type=settings.scale_type, + border_type=settings.border_type, + mode='sequence', + train_transform=transform_exemplar, + test_transform=transform_instance, + joint_transform=transform_joint) + + # Data processing to do on the validation pairs + data_processing_val = processing.SiamFCProcessing( + search_area_factor=settings.search_area_factor, + output_sz=settings.output_sz, + center_jitter_factor=settings.center_jitter_factor, + scale_jitter_factor=settings.scale_jitter_factor, + scale_type=settings.scale_type, + border_type=settings.border_type, + mode='sequence', + transform=transform_exemplar, + joint_transform=transform_joint) + + # The sampler for training + dataset_train = sampler.ATOMSampler( + [vid_train], [1, ], + samples_per_epoch=6650 * settings.batch_size, + max_gap=100, + processing=data_processing_train) + + # The loader for training + train_loader = loader.LTRLoader( + 'train', + dataset_train, + training=True, + batch_size=settings.batch_size, + num_workers=settings.num_workers, + stack_dim=1) + + # The sampler for validation + dataset_val = sampler.ATOMSampler( + [got10k_val], [1, ], + samples_per_epoch=1000 * settings.batch_size, + max_gap=100, + processing=data_processing_val) + + # The loader for validation + val_loader = loader.LTRLoader( + 'val', + dataset_val, + training=False, + batch_size=settings.batch_size, + num_workers=settings.num_workers, + epoch_interval=5, + stack_dim=1) + + # creat network, set objective, creat optimizer, learning rate scheduler, trainer + with dygraph.guard(): + # Create network + net = siamfc_alexnet() + + # Create actor, which wraps network and objective + actor = actors.SiamFCActor( + net=net, + objective=None, + batch_size=settings.batch_size, + shape=(17, 17), + radius=16, + stride=8) + + # Set to training mode + actor.train() + + # define optimizer and learning rate + lr_scheduler = fluid.layers.exponential_decay( + learning_rate=0.01, + decay_steps=6650, + decay_rate=0.8685, + staircase=True) + regularizer = fluid.regularizer.L2DecayRegularizer( + 
            regularization_coeff=0.0005)
+        optimizer = fluid.optimizer.Momentum(
+            momentum=0.9,
+            regularization=regularizer,
+            parameter_list=net.parameters(),
+            learning_rate=lr_scheduler)
+
+        trainer = LTRTrainer(actor, [train_loader, val_loader], optimizer,
+                             settings, lr_scheduler)
+        trainer.train(50, load_latest=False, fail_safe=False)
diff --git a/PaddleCV/tracking/ltr/trainers/__init__.py b/PaddleCV/tracking/ltr/trainers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..33ff4417a25665f5d92481aef449f8d7b2386e13
--- /dev/null
+++ b/PaddleCV/tracking/ltr/trainers/__init__.py
@@ -0,0 +1,2 @@
+from .base_trainer import BaseTrainer
+from .ltr_trainer import LTRTrainer
diff --git a/PaddleCV/tracking/ltr/trainers/base_trainer.py b/PaddleCV/tracking/ltr/trainers/base_trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..99a206d96a9ef4b0a819150f8c9e589e542deeaf
--- /dev/null
+++ b/PaddleCV/tracking/ltr/trainers/base_trainer.py
@@ -0,0 +1,156 @@
+import os
+import glob
+from paddle import fluid
+from paddle.fluid import dygraph
+import pickle
+
+
+class BaseTrainer:
+    """Base trainer class. Contains functions for training and saving/loading checkpoints.
+    Trainer classes should inherit from this one and overload the train_epoch function."""
+
+    def __init__(self, actor, loaders, optimizer, settings, lr_scheduler=None):
+        """
+        args:
+            actor - The actor for training the network
+            loaders - list of dataset loaders, e.g. [train_loader, val_loader]. In each epoch, the trainer runs one
+                        epoch for each loader.
+            optimizer - The optimizer used for training, e.g. Adam
+            settings - Training settings
+            lr_scheduler - Learning rate scheduler
+        """
+        self.actor = actor
+        self.optimizer = optimizer
+        self.lr_scheduler = lr_scheduler
+        self.loaders = loaders
+
+        self.update_settings(settings)
+
+        self.epoch = 0
+        self.stats = {}
+
+    def update_settings(self, settings=None):
+        """Updates the trainer settings. Must be called to update internal settings."""
+        if settings is not None:
+            self.settings = settings
+
+        if self.settings.env.workspace_dir is not None:
+            self.settings.env.workspace_dir = os.path.expanduser(
+                self.settings.env.workspace_dir)
+            self._checkpoint_dir = os.path.join(self.settings.env.workspace_dir,
+                                                'checkpoints')
+            if not os.path.exists(self._checkpoint_dir):
+                os.makedirs(self._checkpoint_dir)
+        else:
+            self._checkpoint_dir = None
+
+    def train(self, max_epochs, load_latest=False, fail_safe=True):
+        """Do training for the given number of epochs.
+        args:
+            max_epochs - Max number of training epochs.
+            load_latest - Bool indicating whether to resume from the latest saved epoch.
+            fail_safe - Bool indicating whether training should automatically restart in case of a crash.
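+
+        For reference, the train_settings scripts in this change construct and
+        drive the trainer like this:
+            trainer = LTRTrainer(actor, [train_loader, val_loader], optimizer,
+                                 settings, lr_scheduler)
+            trainer.train(40, load_latest=False, fail_safe=False)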
+ """ + + num_tries = 10 + for i in range(num_tries): + try: + if load_latest: + self.load_checkpoint() + + for epoch in range(self.epoch + 1, max_epochs + 1): + self.epoch = epoch + self.train_epoch() + + if self._checkpoint_dir: + self.save_checkpoint() + except: + print('Training crashed at epoch {}'.format(self.epoch)) + if fail_safe: + load_latest = True + print('Restarting training from last epoch ...') + else: + raise + + print('Finished training!') + + def train_epoch(self): + raise NotImplementedError + + def save_checkpoint(self): + """Saves a checkpoint of the network and other variables.""" + actor_type = type(self.actor).__name__ + net_type = type(self.actor.net).__name__ + state = { + 'epoch': self.epoch, + 'actor_type': actor_type, + 'net_type': net_type, + 'net_info': getattr(self.actor.net, 'info', None), + 'constructor': getattr(self.actor.net, 'constructor', None), + 'stats': self.stats, + 'settings': self.settings + } + + directory = '{}/{}/{}_ep{:04d}'.format(self._checkpoint_dir, + self.settings.project_path, + net_type, self.epoch) + if not os.path.exists(directory): + os.makedirs(directory) + + fluid.save_dygraph(self.actor.net.state_dict(), directory) + fluid.save_dygraph(self.optimizer.state_dict(), directory) + with open(os.path.join(directory, '_custom_state.pickle'), 'wb') as f: + pickle.dump(state, f) + + def load_checkpoint(self, checkpoint=None): + """Loads a network checkpoint file. + + Can be called in three different ways: + load_checkpoint(): + Loads the latest epoch from the workspace. Use this to continue training. + load_checkpoint(epoch_num): + Loads the network at the given epoch number (int). + load_checkpoint(path_to_checkpoint): + Loads the file from the given absolute path (str). + """ + + net_type = type(self.actor.net).__name__ + + if checkpoint is None: + # Load most recent checkpoint + checkpoint_list = sorted( + glob.glob('{}/{}/{}_ep*'.format(self._checkpoint_dir, + self.settings.project_path, + net_type))) + if checkpoint_list: + checkpoint_path = checkpoint_list[-1].split('.')[0] + else: + print('No matching checkpoint file found') + return + elif isinstance(checkpoint, int): + # Checkpoint is the epoch number + checkpoint_path = '{}/{}/{}_ep{:04d}'.format( + self._checkpoint_dir, self.settings.project_path, net_type, + checkpoint) + elif isinstance(checkpoint, str): + # checkpoint is the path + checkpoint_path = os.path.expanduser(checkpoint) + else: + raise TypeError + + # paddle load network + net_params, opt_params = fluid.load_dygraph(checkpoint_path) + self.actor.net.load_dict(net_params) + self.optimizer.set_dict(opt_params) + + # paddle load state + state_path = '{}/{}/custom_state.pickle'.format( + self._checkpoint_dir, self.settings.project_path) + current_state = pickle.load( + open(os.path.join(state_path, 'custom_state.pickle'), 'rb')) + + print("\nload checkpoint done !! 
Current states are as follows:") + for key, value in enumerate(current_state): + print(key, value) + + return True diff --git a/PaddleCV/tracking/ltr/trainers/ltr_trainer.py b/PaddleCV/tracking/ltr/trainers/ltr_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..c8b0380827493d0114e45c3f08e0f03bef827d01 --- /dev/null +++ b/PaddleCV/tracking/ltr/trainers/ltr_trainer.py @@ -0,0 +1,164 @@ +import os +from collections import OrderedDict + +from ltr.trainers import BaseTrainer +from ltr.admin.stats import AverageMeter, StatValue +from ltr.admin.tensorboard import TensorboardWriter +import paddle +import paddle.fluid as fluid +import paddle.fluid.dygraph as dygraph +import time +import numpy as np + + +class LTRTrainer(BaseTrainer): + def __init__(self, actor, loaders, optimizer, settings, lr_scheduler=None): + """ + args: + actor - The actor for training the network + loaders - list of dataset loaders, e.g. [train_loader, val_loader]. In each epoch, the trainer runs one + epoch for each loader. + optimizer - The optimizer used for training, e.g. Adam + settings - Training settings + lr_scheduler - Learning rate scheduler + """ + super().__init__(actor, loaders, optimizer, settings, lr_scheduler) + + self._set_default_settings() + + # Initialize statistics variables + self.stats = OrderedDict({loader.name: None for loader in self.loaders}) + + # Initialize tensorboard + tensorboard_writer_dir = os.path.join(self.settings.env.tensorboard_dir, + self.settings.project_path) + self.tensorboard_writer = TensorboardWriter(tensorboard_writer_dir, + [l.name for l in loaders]) + + def _set_default_settings(self): + # Dict of all default values + default = {'print_interval': 10, 'print_stats': None, 'description': ''} + + for param, default_value in default.items(): + if getattr(self.settings, param, None) is None: + setattr(self.settings, param, default_value) + + def cycle_dataset(self, loader): + """Do a cycle of training or validation.""" + if loader.training: + self.actor.train() + else: + self.actor.eval() + + self._init_timing() + + for i, data in enumerate(loader, 1): + # get inputs + data = self.to_variable(data) + data['epoch'] = self.epoch + data['settings'] = self.settings + + # forward pass + loss, stats = self.actor(data) + + # backward pass and update weights + if loader.training: + loss.backward() + apply_collective_grads = getattr(self.actor.net, + "apply_collective_grads", None) + if callable(apply_collective_grads): + apply_collective_grads() + self.optimizer.minimize(loss) + self.actor.net.clear_gradients() + + # update statistics + batch_size = data['train_images'].shape[loader.stack_dim] + self._update_stats(stats, batch_size, loader) + + self._print_stats(i, loader, batch_size) + + if i % loader.__len__() == 0: + self.save_checkpoint() + self._stats_new_epoch() + self._write_tensorboard() + return + + def to_variable(self, data_dict): + keys = data_dict.keys() + for k in keys: + if k != "dataset": + data_dict[k] = dygraph.to_variable( + np.array(data_dict[k]).astype(np.float32)) + return data_dict + + def to_array(self, data_dict): + keys = data_dict.keys() + for k in keys: + if k != "dataset": + data_dict[k] = data_dict[k].numpy() + return data_dict + + def train_epoch(self): + """Do one epoch for each loader.""" + for loader in self.loaders: + if self.epoch % loader.epoch_interval == 0: + self.cycle_dataset(loader) + + self._stats_new_epoch() + self._write_tensorboard() + print('{}th epoch train / eval done!'.format(self.epoch)) + + def 
_init_timing(self): + self.num_frames = 0 + self.start_time = time.time() + self.prev_time = self.start_time + + def _update_stats(self, new_stats: OrderedDict, batch_size, loader): + # Initialize stats if not initialized yet + if loader.name not in self.stats.keys() or self.stats[ + loader.name] is None: + self.stats[loader.name] = OrderedDict( + {name: AverageMeter() + for name in new_stats.keys()}) + + for name, val in new_stats.items(): + if name not in self.stats[loader.name].keys(): + self.stats[loader.name][name] = AverageMeter() + self.stats[loader.name][name].update(val, batch_size) + + def _print_stats(self, i, loader, batch_size): + self.num_frames += batch_size + current_time = time.time() + batch_fps = batch_size / (current_time - self.prev_time) + average_fps = self.num_frames / (current_time - self.start_time) + self.prev_time = current_time + if i % self.settings.print_interval == 0 or i == loader.__len__(): + print_str = '[%s: %d, %d / %d] ' % (loader.name, self.epoch, i, + loader.__len__()) + print_str += 'FPS: %.1f (%.1f) , ' % (average_fps, batch_fps) + for name, val in self.stats[loader.name].items(): + if (self.settings.print_stats is None or + name in self.settings.print_stats) and hasattr(val, + 'avg'): + print_str += '%s: %.5f , ' % (name, val.avg) + print_str += '%s: %.5f , ' % ("time", batch_size / batch_fps * + self.settings.print_interval) + print(print_str[:-5]) + + def _stats_new_epoch(self): + for loader_stats in self.stats.values(): + if loader_stats is None: + continue + for stat_value in loader_stats.values(): + if hasattr(stat_value, 'new_epoch'): + stat_value.new_epoch() + + def _write_tensorboard(self): + if self.epoch == 1: + self.tensorboard_writer.write_info(self.settings.module_name, + self.settings.script_name, + self.settings.description) + + self.tensorboard_writer.write_epoch(self.stats, self.epoch) + print('{}/{}'.format(self.settings.module_name, + self.settings.script_name)) diff --git a/PaddleCV/tracking/pytracking/__init__.py b/PaddleCV/tracking/pytracking/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/PaddleCV/tracking/pytracking/admin/environment.py b/PaddleCV/tracking/pytracking/admin/environment.py new file mode 100644 index 0000000000000000000000000000000000000000..a32c103bc04a11774147f9abb3a953a123136fc8 --- /dev/null +++ b/PaddleCV/tracking/pytracking/admin/environment.py @@ -0,0 +1,52 @@ +import importlib +import os + + +class EnvSettings: + def __init__(self): + pytracking_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) + + self.results_path = '{}/tracking_results/'.format(pytracking_path) + self.network_path = '{}/networks/'.format(pytracking_path) + self.dataset_path = '{}/benchmark_datasets/'.format(pytracking_path) + + +def create_default_local_file(): + comment = {'results_path': 'Where to store tracking results', + 'dataset_path': 'Where benchmark datasets are stored', + 'network_path': 'Where tracking networks are stored.'} + + path = os.path.join(os.path.dirname(__file__), 'local.py') + with open(path, 'w') as f: + settings = EnvSettings() + + f.write('from pytracking.admin.environment import EnvSettings\n\n') + f.write('def local_env_settings():\n') + f.write(' settings = EnvSettings()\n\n') + f.write(' # Set your local paths here.\n\n') + + for attr in dir(settings): + comment_str = None + if attr in comment: + comment_str = comment[attr] + attr_val = getattr(settings, attr) + if not attr.startswith('__') and not 
callable(attr_val): + if comment_str is None: + f.write(' settings.{} = \'{}\'\n'.format(attr, attr_val)) + else: + f.write(' settings.{} = \'{}\' # {}\n'.format(attr, attr_val, comment_str)) + f.write('\n return settings\n\n') + + +def env_settings(): + env_module_name = 'pytracking.admin.local' + try: + env_module = importlib.import_module(env_module_name) + return env_module.local_env_settings() + except: + env_file = os.path.join(os.path.dirname(__file__), 'local.py') + + # Create a default file + create_default_local_file() + raise RuntimeError('YOU HAVE NOT SETUP YOUR local.py!!!\n Go to "{}" and set all the paths you need. ' + 'Then try to run again.'.format(env_file)) diff --git a/PaddleCV/tracking/pytracking/admin/local.py b/PaddleCV/tracking/pytracking/admin/local.py new file mode 100644 index 0000000000000000000000000000000000000000..40e8e23f3203e49a15fab9c890eef90300a8d445 --- /dev/null +++ b/PaddleCV/tracking/pytracking/admin/local.py @@ -0,0 +1,13 @@ +from pytracking.admin.environment import EnvSettings + + +def local_env_settings(): + settings = EnvSettings() + + # Set your local paths here. + + settings.dataset_path = '' # Where benchmark datasets are stored + settings.network_path = '' # Where tracking networks are stored. + settings.results_path = '/models/PaddleCV/tracking/pytracking/tracking_results/' # Where to store tracking results + + return settings diff --git a/PaddleCV/tracking/pytracking/eval_benchmark.py b/PaddleCV/tracking/pytracking/eval_benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..cf9536eb2944916d33e42cad35770b0043b38042 --- /dev/null +++ b/PaddleCV/tracking/pytracking/eval_benchmark.py @@ -0,0 +1,308 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import importlib +import os +import os.path as osp +import pickle +import sys +from glob import glob + +import cv2 as cv +import numpy as np +from tqdm import tqdm + +CURRENT_DIR = osp.dirname(__file__) +sys.path.append(osp.join(CURRENT_DIR, '..')) + +from pytracking.admin.environment import env_settings +from pytracking.pysot_toolkit.pysot.datasets import DatasetFactory +from pytracking.pysot_toolkit.pysot.evaluation import EAOBenchmark, AccuracyRobustnessBenchmark, OPEBenchmark +from pytracking.pysot_toolkit.pysot.utils.region import vot_overlap + +parser = argparse.ArgumentParser(description='tracking evaluation') + +parser.add_argument('--dataset', '-d', type=str, help='dataset name') +parser.add_argument( + '--training_base_param', '-tr', type=str, help='training base params name') +parser.add_argument('--epoch', '-e', type=str, help='epoch specifications') +parser.add_argument( + '--tracking_base_param', '-te', type=str, help='tracking base params name') +parser.add_argument( + '--num_repeat', '-n', default=1, type=int, help='number of repeat') +parser.add_argument( + '--exp_id', '-ex', default='', type=str, help='experiment id') + +args = parser.parse_args() + + +def read_image(x): + if isinstance(x, str): + img = cv.imread(x) + else: + img = x + return cv.cvtColor(img, cv.COLOR_BGR2RGB) + + +def get_tracker_params(param_module, params): + tracker_params = param_module.parameters() + tracker_params.debug = 0 # disable debug + # change checkpoint path + tracker_params.features.features[0].net_path = params['checkpoint'] + return tracker_params + + +def create_tracker(params): + base_param = params['tracking_base_param'] + base_tracker = 
base_param.split('.')[0] + param_module = importlib.import_module('pytracking.parameter.{}'.format( + base_param)) + tracker_params = get_tracker_params(param_module, params) + tracker_module = importlib.import_module('pytracking.tracker.{}'.format( + base_tracker)) + tracker_class = tracker_module.get_tracker_class() + return tracker_class(tracker_params) + + +def get_axis_aligned_bbox(region): + region = np.array(region) + if len(region.shape) == 3: + # region (1,4,2) + region = np.array([ + region[0][0][0], region[0][0][1], region[0][1][0], region[0][1][1], + region[0][2][0], region[0][2][1], region[0][3][0], region[0][3][1] + ]) + + cx = np.mean(region[0::2]) + cy = np.mean(region[1::2]) + x1 = min(region[0::2]) + + x2 = max(region[0::2]) + y1 = min(region[1::2]) + y2 = max(region[1::2]) + + A1 = np.linalg.norm(region[0:2] - region[2:4]) * np.linalg.norm(region[ + 2:4] - region[4:6]) + A2 = (x2 - x1) * (y2 - y1) + s = np.sqrt(A1 / A2) + w = s * (x2 - x1) + 1 + h = s * (y2 - y1) + 1 + + x11 = cx - w // 2 + y11 = cy - h // 2 + + return x11, y11, w, h + + +def run_tracker(tracker, video, reset=False): + if reset: + frame_counter = 0 + pred_bboxes = [] + for idx, (img_p, gt_bbox) in enumerate(video): + if idx == frame_counter: + # init your tracker here + image = read_image(img_p) + if len(gt_bbox) == 8: + init_bbox = get_axis_aligned_bbox(gt_bbox) + else: + init_bbox = gt_bbox + tracker.initialize(image, init_bbox) + pred_bboxes.append(1) + elif idx > frame_counter: + # get tracking result here + image = read_image(img_p) + pred_bbox = tracker.track(image) + overlap = vot_overlap(pred_bbox, gt_bbox, + (image.shape[1], image.shape[0])) + if overlap > 0: + # continue tracking + pred_bboxes.append(pred_bbox) + else: + # lost target, restart + pred_bboxes.append(2) + frame_counter = idx + 5 + else: + pred_bboxes.append(0) + else: + pred_bboxes = [] + for idx, (img_p, gt_bbox) in enumerate(video): + if idx == 0: + # init your tracker here + image = read_image(img_p) + if len(gt_bbox) == 8: + init_bbox = get_axis_aligned_bbox(gt_bbox) + else: + init_bbox = gt_bbox + tracker.initialize(image, init_bbox) + pred_bboxes.append(init_bbox) + else: + # get tracking result here + image = read_image(img_p) + pred_bbox = tracker.track(image) + pred_bboxes.append(pred_bbox) + return pred_bboxes + + +def run_one_sequence(video, params, tracker=None): + # idt = multiprocessing.current_process()._identity[0] + # os.environ["CUDA_VISIBLE_DEVICES"] = "{}".format(idt % 4) + save_dir = osp.join(params['result_dir'], params['save_dataset_name'], + params['tracking_base_param'], params['exp_id']) + + if tracker is None: + tracker = create_tracker(params) + + if 'VOT' in params['dataset_name']: + save_sub_dir = osp.join(save_dir, 'baseline', video.name) + os.makedirs(save_sub_dir, exist_ok=True) + num_repeat = params.get('num_repeat', 1) + for repeat_idx in range(1, num_repeat + 1): + save_path = osp.join(save_sub_dir, + video.name + '_{:03d}.txt'.format(repeat_idx)) + if osp.exists(save_path): continue + pred_bboxes = run_tracker(tracker, video, reset=True) + + # Save tracking results + with open(save_path, 'w') as f: + outputs = [] + for res in pred_bboxes: + if isinstance(res, int): + outputs.append('{}'.format(res)) + else: + outputs.append('{},{},{},{}'.format(res[0], res[1], res[ + 2], res[3])) + f.write('\n'.join(outputs)) + else: + os.makedirs(save_dir, exist_ok=True) + save_path = osp.join(save_dir, video.name + '.txt') + if osp.exists(save_path): return + pred_bboxes = run_tracker(tracker, video, 
reset=False) + + # Save tracking results + with open(save_path, 'w') as f: + outputs = [] + for res in pred_bboxes: + outputs.append('{},{},{},{}'.format(res[0], res[1], res[2], res[ + 3])) + f.write('\n'.join(outputs)) + + +def run_one_dataset(dataset, params): + # use the same tracker for all sequences + tracker = create_tracker(params) + # create new tracker for each sequence + # tracker = None + for video in tqdm(list(dataset.videos.values())): + run_one_sequence(video, params, tracker=tracker) + + +def compute_evaluation_metrics(dataset, params): + result_dir = osp.join(params['result_dir'], params['save_dataset_name'], + params['tracking_base_param']) + tracker_name = params['exp_id'] + trackers = [tracker_name] + dataset.set_tracker(result_dir, trackers) + + if 'VOT' in params['dataset_name']: + ar_benchmark = AccuracyRobustnessBenchmark(dataset) + ar_result = {} + ar_result.update(ar_benchmark.eval(trackers)) + + eao_benchmark = EAOBenchmark(dataset) + eao_result = {} + eao_result.update(eao_benchmark.eval(trackers)) + + ar_benchmark.show_result(ar_result, eao_result) + metrics = {'ar': ar_result, 'eao': eao_result} + else: + benchmark = OPEBenchmark(dataset) + success_result = {} + precision_result = {} + success_result.update(benchmark.eval_success(trackers)) + precision_result.update(benchmark.eval_precision(trackers)) + benchmark.show_result(success_result, precision_result) + metrics = {'success': success_result, 'precision': precision_result} + return metrics + + +def save_info(params, metrics): + save_dir = osp.join(params['result_dir'], params['save_dataset_name'], + params['tracking_base_param'], params['exp_id']) + with open(osp.join(save_dir, 'params.pickle'), 'wb') as f: + pickle.dump(params, f) + + with open(osp.join(save_dir, 'metrics.txt'), 'w') as f: + f.write('{}'.format(metrics)) + + +def run_tracking_and_evaluate(params): + """Receive hyperparameters and return the evaluation metric""" + # load dataset + root = os.path.abspath( + osp.join(env_settings().dataset_path, params['save_dataset_name'])) + dataset = DatasetFactory.create_dataset( + name=params['dataset_name'], dataset_root=root) + + run_one_dataset(dataset, params) + metrics = compute_evaluation_metrics(dataset, params) + + return metrics + + +def get_checkpoint_path(training_base_param, epoch): + model_dir = osp.abspath( + osp.join(env_settings().network_path, *training_base_param.split('.'))) + model_names = glob(model_dir + '/*.pdparams') + prefix = '_'.join(model_names[0].split('_')[:-1]) + return osp.join(model_dir, '{}_ep{:04d}'.format(prefix, epoch)) + + +def parse_epoch(epoch_str): + epochs = eval(epoch_str) + try: + iterator = iter(epochs) + except: + if isinstance(epochs, int): + iterator = [epochs] + else: + raise NotImplementedError + return iterator + + +def main(): + for epoch in parse_epoch(args.epoch): + # get checkpoint + checkpoint_pth = get_checkpoint_path(args.training_base_param, epoch) + + if args.exp_id == '': + exp_id = args.training_base_param + '.epoch{}'.format(epoch) + else: + exp_id = args.exp_id + print('=> Evaluating: {}'.format(exp_id)) + + if args.dataset in ['CVPR13', 'OTB50', 'OTB100']: + # for OTB datasets, we save results into the same directory + save_dataset_name = 'OTB100' + else: + save_dataset_name = args.dataset + + # set up parameters + params = { + 'dataset_name': args.dataset, + 'checkpoint': checkpoint_pth, + 'tracking_base_param': args.tracking_base_param, + 'num_repeat': args.num_repeat, + 'exp_id': exp_id, + 'result_dir': env_settings().results_path, + 
'save_dataset_name': save_dataset_name, + } + + metrics = run_tracking_and_evaluate(params) + save_info(params, metrics) + + +if __name__ == '__main__': + main() diff --git a/PaddleCV/tracking/pytracking/features/__init__.py b/PaddleCV/tracking/pytracking/features/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/PaddleCV/tracking/pytracking/features/augmentation.py b/PaddleCV/tracking/pytracking/features/augmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..8171ec59784a4f7f791cf726dd277df88c3ac80b --- /dev/null +++ b/PaddleCV/tracking/pytracking/features/augmentation.py @@ -0,0 +1,205 @@ +import numpy as np +import math + +from paddle.fluid import layers + +import cv2 as cv + +from pytracking.features.preprocessing import numpy_to_paddle, paddle_to_numpy +from pytracking.libs.Fconv2d import Fconv2d +from pytracking.libs.paddle_utils import PTensor, _padding, n2p + + +class Transform: + """Base data augmentation transform class.""" + + def __init__(self, output_sz=None, shift=None): + self.output_sz = output_sz + self.shift = (0, 0) if shift is None else shift + + def __call__(self, image): + raise NotImplementedError + + def crop_to_output(self, image, shift=None): + if isinstance(image, PTensor): + imsz = image.shape[2:] + else: + imsz = image.shape[:2] + + if self.output_sz is None: + pad_h = 0 + pad_w = 0 + else: + pad_h = (self.output_sz[0] - imsz[0]) / 2 + pad_w = (self.output_sz[1] - imsz[1]) / 2 + if shift is None: + shift = self.shift + pad_left = math.floor(pad_w) + shift[1] + pad_right = math.ceil(pad_w) - shift[1] + pad_top = math.floor(pad_h) + shift[0] + pad_bottom = math.ceil(pad_h) - shift[0] + + if isinstance(image, PTensor): + return _padding( + image, (pad_left, pad_right, pad_top, pad_bottom), + mode='replicate') + else: + return _padding( + image, (0, 0, pad_left, pad_right, pad_top, pad_bottom), + mode='replicate') + + +class Identity(Transform): + """Identity transformation.""" + + def __call__(self, image): + return self.crop_to_output(image) + + +class FlipHorizontal(Transform): + """Flip along horizontal axis.""" + + def __call__(self, image): + if isinstance(image, PTensor): + return self.crop_to_output(layers.reverse(image, 3)) + else: + return self.crop_to_output(np.fliplr(image)) + + +class FlipVertical(Transform): + """Flip along vertical axis.""" + + def __call__(self, image: PTensor): + if isinstance(image, PTensor): + return self.crop_to_output(layers.reverse(image, 2)) + else: + return self.crop_to_output(np.flipud(image)) + + +class Translation(Transform): + """Translate.""" + + def __init__(self, translation, output_sz=None, shift=None): + super().__init__(output_sz, shift) + self.shift = (self.shift[0] + translation[0], + self.shift[1] + translation[1]) + + def __call__(self, image): + return self.crop_to_output(image) + + +class Scale(Transform): + """Scale.""" + + def __init__(self, scale_factor, output_sz=None, shift=None): + super().__init__(output_sz, shift) + self.scale_factor = scale_factor + + def __call__(self, image): + # Calculate new size. 
Ensure that it is even so that crop/pad becomes easier + h_orig, w_orig = image.shape[2:] + + if h_orig != w_orig: + raise NotImplementedError + + h_new = round(h_orig / self.scale_factor) + h_new += (h_new - h_orig) % 2 + w_new = round(w_orig / self.scale_factor) + w_new += (w_new - w_orig) % 2 + + if isinstance(image, PTensor): + image_resized = layers.resize_bilinear( + image, [h_new, w_new], align_corners=False) + else: + image_resized = cv.resize( + image, (w_new, h_new), interpolation=cv.INTER_LINEAR) + return self.crop_to_output(image_resized) + + +class Affine(Transform): + """Affine transformation.""" + + def __init__(self, transform_matrix, output_sz=None, shift=None): + super().__init__(output_sz, shift) + self.transform_matrix = transform_matrix + + def __call__(self, image, crop=True): + if isinstance(image, PTensor): + return self.crop_to_output( + numpy_to_paddle(self( + paddle_to_numpy(image), crop=False))) + else: + warp = cv.warpAffine( + image, + self.transform_matrix, + image.shape[1::-1], + borderMode=cv.BORDER_REPLICATE) + if crop: + return self.crop_to_output(warp) + else: + return warp + + +class Rotate(Transform): + """Rotate with given angle.""" + + def __init__(self, angle, output_sz=None, shift=None): + super().__init__(output_sz, shift) + self.angle = math.pi * angle / 180 + + def __call__(self, image, crop=True): + if isinstance(image, PTensor): + return self.crop_to_output( + numpy_to_paddle(self( + paddle_to_numpy(image), crop=False))) + else: + c = (np.expand_dims(np.array(image.shape[:2]), 1) - 1) / 2 + R = np.array([[math.cos(self.angle), math.sin(self.angle)], + [-math.sin(self.angle), math.cos(self.angle)]]) + H = np.concatenate([R, c - R @c], 1) + warp = cv.warpAffine( + image, H, image.shape[1::-1], borderMode=cv.BORDER_REPLICATE) + if crop: + return self.crop_to_output(warp) + else: + return warp + + +class Blur(Transform): + """Blur with given sigma (can be axis dependent).""" + + def __init__(self, sigma, output_sz=None, shift=None): + super().__init__(output_sz, shift) + if isinstance(sigma, (float, int)): + sigma = (sigma, sigma) + self.sigma = sigma + self.filter_size = [math.ceil(2 * s) for s in self.sigma] + + x_coord = [ + np.arange( + -sz, sz + 1, 1, dtype='float32') for sz in self.filter_size + ] + self.filter_np = [ + np.exp(0 - (x * x) / (2 * s**2)) + for x, s in zip(x_coord, self.sigma) + ] + self.filter_np[0] = np.reshape( + self.filter_np[0], [1, 1, -1, 1]) / np.sum(self.filter_np[0]) + self.filter_np[1] = np.reshape( + self.filter_np[1], [1, 1, 1, -1]) / np.sum(self.filter_np[1]) + + def __call__(self, image): + if isinstance(image, PTensor): + sz = image.shape[2:] + filter = [n2p(f) for f in self.filter_np] + im1 = Fconv2d( + layers.reshape(image, [-1, 1, sz[0], sz[1]]), + filter[0], + padding=(self.filter_size[0], 0)) + return self.crop_to_output( + layers.reshape( + Fconv2d( + im1, filter[1], padding=(0, self.filter_size[1])), + [1, -1, sz[0], sz[1]])) + else: + return paddle_to_numpy(self(numpy_to_paddle(image))) diff --git a/PaddleCV/tracking/pytracking/features/color.py b/PaddleCV/tracking/pytracking/features/color.py new file mode 100644 index 0000000000000000000000000000000000000000..969621011d1f21f52ce8b88b3567ddd32d2d9f2e --- /dev/null +++ b/PaddleCV/tracking/pytracking/features/color.py @@ -0,0 +1,30 @@ +from paddle.fluid import layers +from pytracking.features.featurebase import FeatureBase +from pytracking.libs.paddle_utils import PTensor +import numpy as np + + +class RGB(FeatureBase): + """RGB feature normalized to 
[-0.5, 0.5].""" + + def dim(self): + return 3 + + def stride(self): + return self.pool_stride + + def extract(self, im: np.ndarray): + return im / 255 - 0.5 + + +class Grayscale(FeatureBase): + """Grayscale feature normalized to [-0.5, 0.5].""" + + def dim(self): + return 1 + + def stride(self): + return self.pool_stride + + def extract(self, im: np.ndarray): + return np.mean(im / 255 - 0.5, 1, keepdims=True) diff --git a/PaddleCV/tracking/pytracking/features/deep.py b/PaddleCV/tracking/pytracking/features/deep.py new file mode 100644 index 0000000000000000000000000000000000000000..376acf6d07b7390a94fb3d9ae830f245d0b0ee27 --- /dev/null +++ b/PaddleCV/tracking/pytracking/features/deep.py @@ -0,0 +1,349 @@ +import os + +import numpy as np +from paddle import fluid + +from ltr.models.bbreg.atom import atom_resnet50, atom_resnet18 +from ltr.models.siamese.siam import siamfc_alexnet +from pytracking.admin.environment import env_settings +from pytracking.features.featurebase import MultiFeatureBase +from pytracking.libs import TensorList +from pytracking.libs.paddle_utils import n2p + + +class ResNet18(MultiFeatureBase): + """ResNet18 feature. + args: + output_layers: List of layers to output. + net_path: Relative or absolute net path (default should be fine). + use_gpu: Use GPU or CPU. + """ + + def __init__(self, + output_layers=('block2', ), + net_path='atom_iou', + use_gpu=True, + *args, + **kwargs): + super().__init__(*args, **kwargs) + + self.output_layers = list(output_layers) + self.use_gpu = use_gpu + self.net_path = net_path + + def initialize(self): + with fluid.dygraph.guard(): + if os.path.isabs(self.net_path): + net_path_full = self.net_path + else: + net_path_full = os.path.join(env_settings().network_path, + self.net_path) + + self.net = atom_resnet18( + backbone_pretrained=False, + backbone_is_test=True, + iounet_is_test=True) + + state_dictsm, _ = fluid.load_dygraph(net_path_full) + self.net.load_dict(state_dictsm) + self.net.train() + + self.iou_predictor = self.net.bb_regressor + + self.layer_stride = { + 'conv0': 2, + 'conv1': 2, + 'block0': 4, + 'block1': 8, + 'block2': 16, + 'block3': 32, + 'classification': 16, + 'fc': None + } + self.layer_dim = { + 'conv0': 64, + 'conv1': 64, + 'block0': 64, + 'block1': 128, + 'block2': 256, + 'block3': 512, + 'classification': 256, + 'fc': None + } + + self.iounet_feature_layers = self.net.bb_regressor_layer + + if isinstance(self.pool_stride, int) and self.pool_stride == 1: + self.pool_stride = [1] * len(self.output_layers) + + self.feature_layers = sorted( + list(set(self.output_layers + self.iounet_feature_layers))) + + self.mean = np.reshape([0.485, 0.456, 0.406], [1, -1, 1, 1]) + self.std = np.reshape([0.229, 0.224, 0.225], [1, -1, 1, 1]) + + def free_memory(self): + if hasattr(self, 'net'): + del self.net + if hasattr(self, 'iou_predictor'): + del self.iou_predictor + if hasattr(self, 'iounet_backbone_features'): + del self.iounet_backbone_features + if hasattr(self, 'iounet_features'): + del self.iounet_features + + def dim(self): + return TensorList([self.layer_dim[l] for l in self.output_layers]) + + def stride(self): + return TensorList([ + s * self.layer_stride[l] + for l, s in zip(self.output_layers, self.pool_stride) + ]) + + def extract(self, im: np.ndarray, debug_save_name=None): + with fluid.dygraph.guard(): + if debug_save_name is not None: + np.savez(debug_save_name, im) + + im = im / 255. # don't use im /= 255. 
since we don't want to alter the input + im -= self.mean + im /= self.std + im = n2p(im) + + output_features = self.net.extract_features(im, self.feature_layers) + + # Store the raw resnet features which are input to iounet + iounet_backbone_features = TensorList([ + output_features[layer] for layer in self.iounet_feature_layers + ]) + self.iounet_backbone_features = iounet_backbone_features.numpy() + + # Store the processed features from iounet, just before pooling + self.iounet_features = TensorList([ + f.numpy() + for f in self.iou_predictor.get_iou_feat( + iounet_backbone_features) + ]) + + output = TensorList([ + output_features[layer].numpy() for layer in self.output_layers + ]) + return output + + +class ResNet50(MultiFeatureBase): + """ResNet50 feature. + args: + output_layers: List of layers to output. + net_path: Relative or absolute net path (default should be fine). + use_gpu: Use GPU or CPU. + """ + + def __init__(self, + output_layers=('block2', ), + net_path='atom_iou', + use_gpu=True, + *args, + **kwargs): + super().__init__(*args, **kwargs) + + self.output_layers = list(output_layers) + self.use_gpu = use_gpu + self.net_path = net_path + + def initialize(self): + with fluid.dygraph.guard(): + if os.path.isabs(self.net_path): + net_path_full = self.net_path + else: + net_path_full = os.path.join(env_settings().network_path, + self.net_path) + + self.net = atom_resnet50( + backbone_pretrained=False, + backbone_is_test=True, + iounet_is_test=True) + + state_dictsm, _ = fluid.load_dygraph(net_path_full) + self.net.load_dict(state_dictsm) + self.net.train() + + self.iou_predictor = self.net.bb_regressor + + self.layer_stride = { + 'conv0': 2, + 'conv1': 2, + 'block0': 4, + 'block1': 8, + 'block2': 16, + 'block3': 32, + 'classification': 16, + 'fc': None + } + self.layer_dim = { + 'conv0': 64, + 'conv1': 64, + 'block0': 256, + 'block1': 512, + 'block2': 1024, + 'block3': 2048, + 'classification': 256, + 'fc': None + } + + self.iounet_feature_layers = self.net.bb_regressor_layer + + if isinstance(self.pool_stride, int) and self.pool_stride == 1: + self.pool_stride = [1] * len(self.output_layers) + + self.feature_layers = sorted( + list(set(self.output_layers + self.iounet_feature_layers))) + + self.mean = np.reshape([0.485, 0.456, 0.406], [1, -1, 1, 1]) + self.std = np.reshape([0.229, 0.224, 0.225], [1, -1, 1, 1]) + + def free_memory(self): + if hasattr(self, 'net'): + del self.net + if hasattr(self, 'iou_predictor'): + del self.iou_predictor + if hasattr(self, 'iounet_backbone_features'): + del self.iounet_backbone_features + if hasattr(self, 'iounet_features'): + del self.iounet_features + + def dim(self): + return TensorList([self.layer_dim[l] for l in self.output_layers]) + + def stride(self): + return TensorList([ + s * self.layer_stride[l] + for l, s in zip(self.output_layers, self.pool_stride) + ]) + + def extract(self, im: np.ndarray, debug_save_name=None): + with fluid.dygraph.guard(): + if debug_save_name is not None: + np.savez(debug_save_name, im) + + im = im / 255. # don't use im /= 255. 
since we don't want to alter the input + im -= self.mean + im /= self.std + im = n2p(im) + + output_features = self.net.extract_features(im, self.feature_layers) + + # Store the raw resnet features which are input to iounet + iounet_backbone_features = TensorList([ + output_features[layer] for layer in self.iounet_feature_layers + ]) + self.iounet_backbone_features = iounet_backbone_features.numpy() + + # Store the processed features from iounet, just before pooling + self.iounet_features = TensorList([ + f.numpy() + for f in self.iou_predictor.get_iou_feat( + iounet_backbone_features) + ]) + + output = TensorList([ + output_features[layer].numpy() for layer in self.output_layers + ]) + return output + + +class SFCAlexnet(MultiFeatureBase): + """Alexnet feature. + args: + output_layers: List of layers to output. + net_path: Relative or absolute net path (default should be fine). + use_gpu: Use GPU or CPU. + """ + + def __init__(self, + output_layers=('conv5', ), + net_path='estimator', + use_gpu=True, + *args, + **kwargs): + super().__init__(*args, **kwargs) + + self.output_layers = list(output_layers) + self.use_gpu = use_gpu + self.net_path = net_path + + def initialize(self): + with fluid.dygraph.guard(): + if os.path.isabs(self.net_path): + net_path_full = self.net_path + else: + net_path_full = os.path.join(env_settings().network_path, + self.net_path) + + self.net = siamfc_alexnet( + backbone_pretrained=False, + backbone_is_test=True, + estimator_is_test=True) + + state_dictsm, _ = fluid.load_dygraph(net_path_full) + self.net.load_dict(state_dictsm) + self.net.train() + + self.target_estimator = self.net.target_estimator + + self.layer_stride = {'conv5': 8} + self.layer_dim = {'conv5': 256} + + self.estimator_feature_layers = self.net.target_estimator_layer + + if isinstance(self.pool_stride, int) and self.pool_stride == 1: + self.pool_stride = [1] * len(self.output_layers) + + self.feature_layers = sorted( + list(set(self.output_layers + self.estimator_feature_layers))) + + self.mean = np.reshape([0., 0., 0.], [1, -1, 1, 1]) + self.std = np.reshape([1 / 255., 1 / 255., 1 / 255.], [1, -1, 1, 1]) + + def free_memory(self): + if hasattr(self, 'net'): + del self.net + if hasattr(self, 'target_estimator'): + del self.target_estimator + if hasattr(self, 'estimator_backbone_features'): + del self.estimator_backbone_features + + def dim(self): + return TensorList([self.layer_dim[l] for l in self.output_layers]) + + def stride(self): + return TensorList([ + s * self.layer_stride[l] + for l, s in zip(self.output_layers, self.pool_stride) + ]) + + def extract(self, im: np.ndarray, debug_save_name=None): + with fluid.dygraph.guard(): + if debug_save_name is not None: + np.savez(debug_save_name, im) + + im = im / 255. # don't use im /= 255. 
since we don't want to alter the input + im -= self.mean + im /= self.std + im = n2p(im) + + output_features = self.net.extract_features(im, self.feature_layers) + + # Store the raw backbone features which are input to estimator + estimator_backbone_features = TensorList([ + output_features[layer] + for layer in self.estimator_feature_layers + ]) + self.estimator_backbone_features = estimator_backbone_features.numpy( + ) + + output = TensorList([ + output_features[layer].numpy() for layer in self.output_layers + ]) + return output diff --git a/PaddleCV/tracking/pytracking/features/extractor.py b/PaddleCV/tracking/pytracking/features/extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..025c915491a6f38dde81622a64accb5e28c27dfc --- /dev/null +++ b/PaddleCV/tracking/pytracking/features/extractor.py @@ -0,0 +1,185 @@ +import numpy as np +from paddle import fluid +from paddle.fluid import layers +from pytracking.features.preprocessing import sample_patch +from pytracking.libs import TensorList + + +class ExtractorBase: + """Base feature extractor class. + args: + features: List of features. + """ + + def __init__(self, features): + self.features = features + + def initialize(self): + for f in self.features: + f.initialize() + + def free_memory(self): + for f in self.features: + f.free_memory() + + +class SingleResolutionExtractor(ExtractorBase): + """Single resolution feature extractor. + args: + features: List of features. + """ + + def __init__(self, features): + super().__init__(features) + + self.feature_stride = self.features[0].stride() + if isinstance(self.feature_stride, (list, TensorList)): + self.feature_stride = self.feature_stride[0] + + def stride(self): + return self.feature_stride + + def size(self, input_sz): + return input_sz // self.stride() + + def extract(self, im, pos, scales, image_sz): + if isinstance(scales, (int, float)): + scales = [scales] + + # Get image patches + im_patches = np.stack( + [sample_patch(im, pos, s * image_sz, image_sz) for s in scales]) + im_patches = np.transpose(im_patches, (0, 3, 1, 2)) + + # Compute features + feature_map = layers.concat( + TensorList( + [f.get_feature(im_patches) for f in self.features]).unroll(), + axis=1) + + return feature_map + + +class MultiResolutionExtractor(ExtractorBase): + """Multi-resolution feature extractor. + args: + features: List of features. 
+ """ + + def __init__(self, features): + super().__init__(features) + self.is_color = None + + def stride(self): + return TensorList( + [f.stride() for f in self.features + if self._return_feature(f)]).unroll() + + def size(self, input_sz): + return TensorList([ + f.size(input_sz) for f in self.features if self._return_feature(f) + ]).unroll() + + def dim(self): + return TensorList( + [f.dim() for f in self.features + if self._return_feature(f)]).unroll() + + def get_fparams(self, name: str=None): + if name is None: + return [f.fparams for f in self.features if self._return_feature(f)] + return TensorList([ + getattr(f.fparams, name) for f in self.features + if self._return_feature(f) + ]).unroll() + + def get_attribute(self, name: str, ignore_missing: bool=False): + if ignore_missing: + return TensorList([ + getattr(f, name) for f in self.features + if self._return_feature(f) and hasattr(f, name) + ]) + else: + return TensorList([ + getattr(f, name, None) for f in self.features + if self._return_feature(f) + ]) + + def get_unique_attribute(self, name: str): + feat = None + for f in self.features: + if self._return_feature(f) and hasattr(f, name): + if feat is not None: + raise RuntimeError('The attribute was not unique.') + feat = f + if feat is None: + raise RuntimeError('The attribute did not exist') + return getattr(feat, name) + + def _return_feature(self, f): + return self.is_color is None or self.is_color and f.use_for_color or not self.is_color and f.use_for_gray + + def set_is_color(self, is_color: bool): + self.is_color = is_color + + def extract(self, im, pos, scales, image_sz, debug_save_name=None): + """Extract features. + args: + im: Image. + pos: Center position for extraction. + scales: Image scales to extract features from. + image_sz: Size to resize the image samples to before extraction. + """ + if isinstance(scales, (int, float)): + scales = [scales] + + # Get image patches + with fluid.dygraph.guard(fluid.CPUPlace()): + im_patches = np.stack([ + sample_patch(im, pos, s * image_sz, image_sz) for s in scales + ]) + + if debug_save_name is not None: + np.save(debug_save_name, im_patches) + + im_patches = np.transpose(im_patches, (0, 3, 1, 2)) + + # Compute features + feature_map = TensorList( + [f.get_feature(im_patches) for f in self.features]).unroll() + + return feature_map + + def extract_transformed(self, + im, + pos, + scale, + image_sz, + transforms, + debug_save_name=None): + """Extract features from a set of transformed image samples. + args: + im: Image. + pos: Center position for extraction. + scale: Image scale to extract features from. + image_sz: Size to resize the image samples to before extraction. + transforms: A set of image transforms to apply. 
+ """ + + # Get image patche + im_patch = sample_patch(im, pos, scale * image_sz, image_sz) + + # Apply transforms + with fluid.dygraph.guard(fluid.CPUPlace()): + im_patches = np.stack([T(im_patch) for T in transforms]) + + if debug_save_name is not None: + np.save(debug_save_name, im_patches) + + im_patches = np.transpose(im_patches, (0, 3, 1, 2)) + + # Compute features + feature_map = TensorList( + [f.get_feature(im_patches) for f in self.features]).unroll() + + return feature_map diff --git a/PaddleCV/tracking/pytracking/features/featurebase.py b/PaddleCV/tracking/pytracking/features/featurebase.py new file mode 100644 index 0000000000000000000000000000000000000000..a9ca9cd1fba0ee0009d13ed6c498e416cf3dedf9 --- /dev/null +++ b/PaddleCV/tracking/pytracking/features/featurebase.py @@ -0,0 +1,158 @@ +from paddle import fluid +from paddle.fluid import layers +from pytracking.libs import TensorList +from pytracking.libs.paddle_utils import floordiv, n2p, broadcast_op + +import numpy as np + + +class FeatureBase: + """Base feature class. + args: + fparams: Feature specific parameters. + pool_stride: Amount of average pooling to apply do downsample the feature map. + output_size: Alternatively, specify the output size of the feature map. Adaptive average pooling will be applied. + normalize_power: The power exponent for the normalization. None means no normalization (default). + use_for_color: Use this feature for color images. + use_for_gray: Use this feature for grayscale images. + """ + + def __init__(self, + fparams=None, + pool_stride=None, + output_size=None, + normalize_power=None, + use_for_color=True, + use_for_gray=True): + self.fparams = fparams + self.pool_stride = 1 if pool_stride is None else pool_stride + self.output_size = output_size + self.normalize_power = normalize_power + self.use_for_color = use_for_color + self.use_for_gray = use_for_gray + + def initialize(self): + pass + + def free_memory(self): + pass + + def dim(self): + raise NotImplementedError + + def stride(self): + raise NotImplementedError + + def size(self, im_sz): + if self.output_size is None: + return floordiv(im_sz, self.stride()) + return self.output_size + + def extract(self, im): + """Performs feature extraction.""" + raise NotImplementedError + + def get_feature(self, im: np.ndarray): + """Get the feature. Generally, call this function. + args: + im: image patch + """ + + # Return empty tensor if it should not be used + is_color = im.shape[1] == 3 + if is_color and not self.use_for_color or not is_color and not self.use_for_gray: + return np.array([]) + + # Extract feature + feat = self.extract(im) + + # Pool/downsample + with fluid.dygraph.guard(): + feat = n2p(feat) + + if self.output_size is not None: + feat = layers.adaptive_pool2d(feat, self.output_size, 'avg') + elif self.pool_stride != 1: + feat = layers.pool2d( + feat, + self.pool_stride, + pool_stride=self.pool_stride, + pool_type='avg') + + # Normalize + if self.normalize_power is not None: + feat /= ( + layers.reduce_sum( + layers.reshape( + layers.abs(feat), [feat.shape[0], 1, 1, -1])** + self.normalize_power, + dim=3, + keep_dim=True) / + (feat.shape[1] * feat.shape[2] * feat.shape[3]) + 1e-10)**( + 1 / self.normalize_power) + + feat = feat.numpy() + return feat + + +class MultiFeatureBase(FeatureBase): + """Base class for features potentially having multiple feature blocks as output (like CNNs). + See FeatureBase for more info. 
+ """ + + def size(self, im_sz): + if self.output_size is None: + return TensorList([floordiv(im_sz, s) for s in self.stride()]) + if isinstance(im_sz, PTensor): + return TensorList([ + floordiv(im_sz, s) if sz is None else np.array([sz[0], sz[1]]) + for sz, s in zip(self.output_size, self.stride()) + ]) + + def get_feature(self, im: np.ndarray): + """Get the feature. Generally, call this function. + args: + im: image patch + """ + + # Return empty tensor if it should not be used + is_color = im.shape[1] == 3 + if is_color and not self.use_for_color or not is_color and not self.use_for_gray: + return np.array([]) + + feat_list = self.extract(im) + + output_sz = [None] * len( + feat_list) if self.output_size is None else self.output_size + + # Pool/downsample + with fluid.dygraph.guard(): + feat_list = [n2p(f) for f in feat_list] + + for i, (sz, s) in enumerate(zip(output_sz, self.pool_stride)): + if sz is not None: + feat_list[i] = layers.adaptive_pool2d( + feat_list[i], sz, pool_type='avg') + elif s != 1: + feat_list[i] = layers.pool2d( + feat_list[i], s, pool_stride=s, pool_type='avg') + + # Normalize + if self.normalize_power is not None: + new_feat_list = [] + for feat in feat_list: + norm = (layers.reduce_sum( + layers.reshape( + layers.abs(feat), [feat.shape[0], 1, 1, -1])** + self.normalize_power, + dim=3, + keep_dim=True) / + (feat.shape[1] * feat.shape[2] * feat.shape[3] + ) + 1e-10)**(1 / self.normalize_power) + feat = broadcast_op(feat, norm, 'div') + new_feat_list.append(feat) + feat_list = new_feat_list + + # To numpy + feat_list = TensorList([f.numpy() for f in feat_list]) + return feat_list diff --git a/PaddleCV/tracking/pytracking/features/preprocessing.py b/PaddleCV/tracking/pytracking/features/preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..280f40e2753719c13d6635eaf78296513a602de4 --- /dev/null +++ b/PaddleCV/tracking/pytracking/features/preprocessing.py @@ -0,0 +1,120 @@ +import numpy as np +import cv2 as cv +from paddle.fluid import dygraph +from paddle.fluid import layers +from pytracking.libs.paddle_utils import PTensor, n2p, _padding, squeeze, unsqueeze + + +def numpy_to_paddle(a: np.ndarray): + return unsqueeze( + layers.transpose( + layers.cast(dygraph.to_variable(a), 'float32'), [2, 0, 1]), [0]) + + +def paddle_to_numpy(a: PTensor): + return layers.transpose(squeeze(a, [0]), [1, 2, 0]).numpy() + + +def sample_patch(im: np.ndarray, + pos: np.ndarray, + sample_sz: np.ndarray, + output_sz: np.ndarray=None): + """Sample an image patch. 
+ + args: + im: Image + pos: center position of crop + sample_sz: size to crop + output_sz: size to resize to + """ + + # copy and convert + posl = pos.astype('long') + + # Compute pre-downsampling factor + if output_sz is not None: + resize_factor = np.min( + sample_sz.astype('float32') / output_sz.astype('float32')) + df = int(max(int(resize_factor - 0.1), 1)) + else: + df = int(1) + + sz = sample_sz.astype('float32') / df # new size + + # Do downsampling + if df > 1: + os = posl % df # offset + posl = ((posl - os) / df).astype('long') # new position + im2 = im[os[0]::df, os[1]::df] # downsample + else: + im2 = im + + # compute size to crop + szl = np.maximum( + np.round(sz), np.array( + [2., 2.], dtype='float32')).astype('long') + + # Extract top and bottom coordinates + tl = posl - (szl - 1) // 2 + br = posl + szl // 2 + + # Get image patch + im_patch = _padding( + im2, (0, 0, -tl[1], br[1] - im2.shape[1] + 1, -tl[0], + br[0] - im2.shape[0] + 1), + mode='replicate') + + if output_sz is None or (im_patch.shape[0] == output_sz[0] and + im_patch.shape[1] == output_sz[1]): + return im_patch + + # Resample + osz = output_sz.astype('long') + im_patch = cv.resize( + im_patch, (osz[1], osz[0]), interpolation=cv.INTER_LINEAR) + return im_patch + + +def sample_patch_with_mean_pad(im: np.ndarray, + pos: np.ndarray, + sample_sz: np.ndarray, + output_sz: np.ndarray=None): + """Sample an image patch. + + args: + im: Image + pos: center position of crop + sample_sz: size to crop + output_sz: size to resize to + """ + + # copy and convert + # posl = np.round(pos).astype('long') # TODO: maybe we should use round + posl = pos.astype('long') + + im2 = im + sz = sample_sz.astype('float32') + # compute size to crop + szl = np.maximum( + np.round(sz), np.array( + [2., 2.], dtype='float32')).astype('long') + + # Extract top and bottom coordinates + tl = posl - (szl - 1) // 2 + br = posl + szl // 2 + + # Get image patch + im_patch = _padding( + im2, (0, 0, -tl[1], br[1] - im2.shape[1] + 1, -tl[0], + br[0] - im2.shape[0] + 1), + mode='replicate') + + if output_sz is None or (im_patch.shape[0] == output_sz[0] and + im_patch.shape[1] == output_sz[1]): + return im_patch + + # Resample + osz = output_sz.astype('long') + im_patch = cv.resize( + im_patch, (osz[1], osz[0]), interpolation=cv.INTER_LINEAR) + return im_patch diff --git a/PaddleCV/tracking/pytracking/libs/Fconv2d.py b/PaddleCV/tracking/pytracking/libs/Fconv2d.py new file mode 100644 index 0000000000000000000000000000000000000000..6d926b53934ac4b5b91f1d3859f00afffde8d7a2 --- /dev/null +++ b/PaddleCV/tracking/pytracking/libs/Fconv2d.py @@ -0,0 +1,259 @@ +from __future__ import print_function +import paddle +import paddle.fluid as fluid + +from paddle.fluid import core + +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.dygraph.layer_object_helper import LayerObjectHelper + +from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer + +from paddle.fluid.param_attr import ParamAttr + +from paddle.fluid.framework import Variable, OpProtoHolder, in_dygraph_mode +from paddle.fluid.layers import utils +import numpy as np + +import paddle +import paddle.fluid as fluid + +from paddle.fluid import core + +from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer + +from paddle.fluid.dygraph import dygraph_utils + +from paddle.fluid.framework import Variable, OpProtoHolder, in_dygraph_mode +from paddle.fluid.layers import utils + + +def Fconv2d( + input, + filter, + stride=1, + padding=0, + dilation=1, + 
groups=1, + use_cudnn=True, ): + """ + Similar with conv2d, this is a convolution2D layers. Difference + is filter can be token as input directly instead of setting filter size + and number of fliters. Filter is a 4-D tensor with shape + [num_filter, num_channel, filter_size_h, filter_size_w]. + Args: + input (Variable): The input image with [N, C, H, W] format. + filter(Variable): The input filter with [out_channels, in_channels, H, W] format. + stride (int|tuple): The stride size. If stride is a tuple, it must + contain two integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. Default: stride = 1. + padding (int|tuple): The padding size. If padding is a tuple, it must + contain two integers, (padding_H, padding_W). Otherwise, the + padding_H = padding_W = padding. Default: padding = 0. + dilation (int|tuple): The dilation size. If dilation is a tuple, it must + contain two integers, (dilation_H, dilation_W). Otherwise, the + dilation_H = dilation_W = dilation. Default: dilation = 1. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv2d + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: True + act (str): Activation type, if it is set to None, activation is not appended. + Default: None + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Default: None + Returns: + Variable: The tensor variable storing the convolution and \ + non-linearity activation result. + Raises: + ValueError: If the shapes of input, filter_size, stride, padding and + groups mismatch. + Examples: + .. code-block:: python + data = fluid.layers.data(name='data', shape=[3, 32, 32], \ + dtype='float32') + filter = fluid.layers.data(name='filter',shape=[10,3,3,3], \ + dtype='float32',append_batch_size=False) + conv2d = fluid.layers.conv2d(input=data, + filter=filter, + act="relu") + """ + conv_with_filter = Conv2D( + stride=stride, padding=padding, dilation=dilation, groups=groups) + return conv_with_filter(input, filter) + + +class Conv2D(fluid.dygraph.layers.Layer): + """ + This interface is used to construct a callable object of the ``Conv2D`` class. + For more details, refer to code examples. + The convolution2D layer calculates the output based on the input, filter + and strides, paddings, dilations, groups parameters. Input and + Output are in NCHW format, where N is batch size, C is the number of + the feature map, H is the height of the feature map, and W is the width of the feature map. + Filter's shape is [MCHW] , where M is the number of output feature map, + C is the number of input feature map, H is the height of the filter, + and W is the width of the filter. If the groups is greater than 1, + C will equal the number of input feature map divided by the groups. + Please refer to UFLDL's `convolution + `_ + for more detials. + If bias attribution and activation type are provided, bias is added to the + output of the convolution, and the corresponding activation function is + applied to the final result. + For each input :math:`X`, the equation is: + .. math:: + Out = \\sigma (W \\ast X + b) + Where: + * :math:`X`: Input value, a ``Tensor`` with NCHW format. 
+ * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] . + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1]. + * :math:`\\sigma`: Activation function. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + Example: + - Input: + Input shape: :math:`(N, C_{in}, H_{in}, W_{in})` + Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)` + - Output: + Output shape: :math:`(N, C_{out}, H_{out}, W_{out})` + Where + .. math:: + H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\ + W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1 + Parameters: + num_channels(int): The number of channels in the input image. + num_filters(int): The number of filter. It is as same as the output + feature map. + filter_size (int or tuple): The filter size. If filter_size is a tuple, + it must contain two integers, (filter_size_H, filter_size_W). + Otherwise, the filter will be a square. + stride (int or tuple, optional): The stride size. If stride is a tuple, it must + contain two integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. Default: 1. + padding (int or tuple, optional): The padding size. If padding is a tuple, it must + contain two integers, (padding_H, padding_W). Otherwise, the + padding_H = padding_W = padding. Default: 0. + dilation (int or tuple, optional): The dilation size. If dilation is a tuple, it must + contain two integers, (dilation_H, dilation_W). Otherwise, the + dilation_H = dilation_W = dilation. Default: 1. + groups (int, optional): The groups number of the Conv2d Layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. Default: 1. + param_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter) + of conv2d. If it is set to None or one attribute of ParamAttr, conv2d + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with :math:`Normal(0.0, std)`, + and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. + bias_attr (ParamAttr or bool, optional): The attribute for the bias of conv2d. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv2d + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + use_cudnn (bool, optional): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: True. + act (str, optional): Activation type, if it is set to None, activation is not appended. + Default: None. + dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32". + Attribute: + **weight** (Parameter): the learnable weights of filter of this layer. + **bias** (Parameter or None): the learnable bias of this layer. + Returns: + None + + Raises: + ValueError: if ``use_cudnn`` is not a bool value. + Examples: + .. 
code-block:: python + from paddle.fluid.dygraph.base import to_variable + import paddle.fluid as fluid + from paddle.fluid.dygraph import Conv2D + import numpy as np + data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') + with fluid.dygraph.guard(): + conv2d = Conv2D(3, 2, 3) + data = to_variable(data) + conv = conv2d(data) + """ + + def __init__(self, + stride=1, + padding=0, + dilation=1, + groups=None, + use_cudnn=True, + act=None, + dtype='float32'): + super(Conv2D, self).__init__() + self._groups = groups + self._stride = utils.convert_to_list(stride, 2, 'stride') + self._padding = utils.convert_to_list(padding, 2, 'padding') + self._dilation = utils.convert_to_list(dilation, 2, 'dilation') + self._act = act + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False") + self._use_cudnn = use_cudnn + self._dtype = dtype + + # TODO: recover the usage of depthwise_conv2d when it's + # kernel fixed https://github.com/PaddlePaddle/Paddle/issues/17098 + # if (self._num_channels == self._groups and + # num_filters % self._num_channels == 0 and not self._use_cudnn): + # self._l_type = 'depthwise_conv2d' + # else: + # self._l_type = 'conv2d' + self._l_type = 'conv2d' + + def forward(self, input, weight, bias=None): + inputs = { + 'Input': [input], + 'Filter': [weight], + } + attrs = { + 'strides': self._stride, + 'paddings': self._padding, + 'dilations': self._dilation, + 'groups': self._groups if self._groups else 1, + 'use_cudnn': self._use_cudnn, + 'use_mkldnn': False, + } + + if in_dygraph_mode(): + outs = core.ops.conv2d(inputs, attrs) + pre_bias = outs['Output'][0] + + pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, bias, 1) + + return dygraph_utils._append_activation_in_dygraph(pre_act, + self._act) + + pre_bias = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + + self._helper.append_op( + type=self._l_type, + inputs={ + 'Input': input, + 'Filter': weight, + }, + outputs={"Output": pre_bias}, + attrs=attrs) + + if bias is not None: + pre_act = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + self._helper.append_op( + type='elementwise_add', + inputs={'X': [pre_bias], + 'Y': [bias]}, + outputs={'Out': [pre_act]}, + attrs={'axis': 1}) + else: + pre_act = pre_bias + + # Currently, we don't support inplace in dygraph mode + return self._helper.append_activation(pre_act, act=self._act) diff --git a/PaddleCV/tracking/pytracking/libs/Fconv2d_static.py b/PaddleCV/tracking/pytracking/libs/Fconv2d_static.py new file mode 100644 index 0000000000000000000000000000000000000000..d62edb7083d67559c7c6c0304be976a163a00117 --- /dev/null +++ b/PaddleCV/tracking/pytracking/libs/Fconv2d_static.py @@ -0,0 +1,173 @@ +from __future__ import print_function +import paddle +import paddle.fluid as fluid + +from paddle.fluid import core + +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.dygraph.layer_object_helper import LayerObjectHelper + +from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer + +from paddle.fluid.param_attr import ParamAttr + +from paddle.fluid.framework import Variable, OpProtoHolder, in_dygraph_mode +from paddle.fluid.layers import utils +import numpy as np + + +def Fconv2d(input, + filter, + stride=1, + padding=0, + dilation=1, + groups=None, + use_cudnn=True, + name=None): + """ + Similar with conv2d, this is a convolution2D layers. Difference + is filter can be token as input directly instead of setting filter size + and number of fliters. 
Filter is a 4-D tensor with shape + [num_filter, num_channel, filter_size_h, filter_size_w]. + Args: + input (Variable): The input image with [N, C, H, W] format. + filter(Variable): The input filter with [out_channels, in_channels, H, W] format. + stride (int|tuple): The stride size. If stride is a tuple, it must + contain two integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. Default: stride = 1. + padding (int|tuple): The padding size. If padding is a tuple, it must + contain two integers, (padding_H, padding_W). Otherwise, the + padding_H = padding_W = padding. Default: padding = 0. + dilation (int|tuple): The dilation size. If dilation is a tuple, it must + contain two integers, (dilation_H, dilation_W). Otherwise, the + dilation_H = dilation_W = dilation. Default: dilation = 1. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv2d + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: True + act (str): Activation type, if it is set to None, activation is not appended. + Default: None + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Default: None + Returns: + Variable: The tensor variable storing the convolution and \ + non-linearity activation result. + Raises: + ValueError: If the shapes of input, filter_size, stride, padding and + groups mismatch. + Examples: + .. code-block:: python + data = fluid.layers.data(name='data', shape=[3, 32, 32], \ + dtype='float32') + filter = fluid.layers.data(name='filter',shape=[10,3,3,3], \ + dtype='float32',append_batch_size=False) + conv2d = fluid.layers.conv2d(input=data, + filter=filter, + act="relu") + """ + helper = LayerHelper("conv2d_with_filter", **locals()) + num_channels = input.shape[1] + num_filters = filter.shape[0] + num_filter_channels = filter.shape[1] + l_type = 'conv2d' + # if (num_channels == groups and + if (num_channels == groups and num_filters % num_channels == 0 and + not use_cudnn): + l_type = 'depthwise_conv2d' + if groups is None: + assert num_filter_channels == num_channels + groups = 1 + else: + if num_channels % groups != 0: + raise ValueError("num_channels must be divisible by groups.") + if num_channels // groups != num_filter_channels: + raise ValueError("num_filter_channels must equal to num_channels\ + divided by groups.") + + stride = utils.convert_to_list(stride, 2, 'stride') + padding = utils.convert_to_list(padding, 2, 'padding') + dilation = utils.convert_to_list(dilation, 2, 'dilation') + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False") + pre_bias = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type=l_type, + inputs={ + 'Input': input, + 'Filter': filter, + }, + outputs={"Output": pre_bias}, + attrs={ + 'strides': stride, + 'paddings': padding, + 'dilations': dilation, + 'groups': groups, + 'use_cudnn': use_cudnn, + 'use_mkldnn': False + }) + + return pre_bias + + +def test_conv2d_with_filter(): + exemplar = np.random.random((8, 4, 6, 6)).astype(np.float32) + instance = np.random.random((8, 4, 22, 22)).astype(np.float32) + + # fluid.layers.data(append_batch_size=) + use_gpu = False + place = 
fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() + + train_program = fluid.Program() + start_program = fluid.Program() + + with fluid.program_guard(train_program, start_program): + x = fluid.layers.data( + name="inst", shape=[8, 4, 22, 22], append_batch_size=False) + y = fluid.layers.data( + name="exem", shape=[8, 4, 6, 6], append_batch_size=False) + bias_att = fluid.ParamAttr( + name="bias_", initializer=fluid.initializer.ConstantInitializer(1.)) + out = conv2d_with_filter(x, y, groups=1) + weight_att = fluid.ParamAttr( + name='weight', + initializer=fluid.initializer.NumpyArrayInitializer(exemplar)) + bias_att = fluid.ParamAttr( + name="bias", initializer=fluid.initializer.ConstantInitializer(0.)) + res = fluid.layers.conv2d( + x, + 8, + 6, + param_attr=weight_att, + bias_attr=bias_att, + stride=1, + padding=0, + dilation=1) + + exe = fluid.Executor(place) + exe.run(program=fluid.default_startup_program()) + print(out.shape) + + compiled_prog = fluid.compiler.CompiledProgram(train_program) + out, res = exe.run(compiled_prog, + feed={"inst": instance, + "exem": exemplar}, + fetch_list=[out.name, res.name]) + + print(np.sum(out - res)) + np.testing.assert_allclose(out, res, rtol=1e-5, atol=0) + + with fluid.dygraph.guard(): + exem = fluid.dygraph.to_variable(exemplar) + inst = fluid.dygraph.to_variable(instance) + + out = conv2d_with_filter(inst, exem, groups=1) + + print(np.sum(out.numpy() - res)) + np.testing.assert_allclose(out.numpy(), res, rtol=1e-5, atol=0) + + +if __name__ == '__main__': + test_conv2d_with_filter() diff --git a/PaddleCV/tracking/pytracking/libs/__init__.py b/PaddleCV/tracking/pytracking/libs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d0cb37ab5fe647283c1fb035260b4681a1ac4fd6 --- /dev/null +++ b/PaddleCV/tracking/pytracking/libs/__init__.py @@ -0,0 +1,2 @@ +from .tensorlist import TensorList +from .tensordict import TensorDict diff --git a/PaddleCV/tracking/pytracking/libs/complex.py b/PaddleCV/tracking/pytracking/libs/complex.py new file mode 100644 index 0000000000000000000000000000000000000000..1de88ef856eb7478dc27e08aa3c2bf392891e9b3 --- /dev/null +++ b/PaddleCV/tracking/pytracking/libs/complex.py @@ -0,0 +1,212 @@ +import numpy as np +from pytracking.libs.tensorlist import tensor_operation + + +def is_complex(a: np.array) -> bool: + return a.ndim >= 4 and a.shape[-1] == 2 + + +def is_real(a: np.array) -> bool: + return not is_complex(a) + + +@tensor_operation +def mult(a: np.array, b: np.array): + """Pointwise complex multiplication of complex tensors.""" + + if is_real(a): + if a.ndim >= b.ndim: + raise ValueError('Incorrect dimensions.') + # a is real + return mult_real_cplx(a, b) + if is_real(b): + if b.ndim >= a.ndim: + raise ValueError('Incorrect dimensions.') + # b is real + return mult_real_cplx(b, a) + + # Both complex + c = mult_real_cplx(a[..., 0], b) + c[..., 0] -= a[..., 1] * b[..., 1] + c[..., 1] += a[..., 1] * b[..., 0] + return c + + +@tensor_operation +def mult_conj(a: np.array, b: np.array): + """Pointwise complex multiplication of complex tensors, with conjugate on b: a*conj(b).""" + + if is_real(a): + if a.ndim >= b.ndim: + raise ValueError('Incorrect dimensions.') + # a is real + return mult_real_cplx(a, conj(b)) + if is_real(b): + if b.ndim >= a.ndim: + raise ValueError('Incorrect dimensions.') + # b is real + return mult_real_cplx(b, a) + + # Both complex + c = mult_real_cplx(b[..., 0], a) + c[..., 0] += a[..., 1] * b[..., 1] + c[..., 1] -= a[..., 0] * b[..., 1] + return c + + +@tensor_operation +def 
mult_real_cplx(a: np.array, b: np.array): + """Pointwise complex multiplication of real tensor a with complex tensor b.""" + + if is_real(b): + raise ValueError('Last dimension must have length 2.') + + return np.expand_dims(a, -1) * b + + +@tensor_operation +def div(a: np.array, b: np.array): + """Pointwise complex division of complex tensors.""" + + if is_real(b): + if b.ndim >= a.ndim: + raise ValueError('Incorrect dimensions.') + # b is real + return div_cplx_real(a, b) + + return div_cplx_real(mult_conj(a, b), abs_sqr(b)) + + +@tensor_operation +def div_cplx_real(a: np.array, b: np.array): + """Pointwise complex division of complex tensor a with real tensor b.""" + + if is_real(a): + raise ValueError('Last dimension must have length 2.') + + return a / np.expand_dims(b, -1) + + +@tensor_operation +def abs_sqr(a: np.array): + """Squared absolute value.""" + + if is_real(a): + raise ValueError('Last dimension must have length 2.') + + return np.sum(a * a, -1) + + +@tensor_operation +def abs(a: np.array): + """Absolute value.""" + + if is_real(a): + raise ValueError('Last dimension must have length 2.') + + return np.sqrt(abs_sqr(a)) + + +@tensor_operation +def conj(a: np.array): + """Complex conjugate.""" + + if is_real(a): + raise ValueError('Last dimension must have length 2.') + + # return a * np.array([1, -1], device=a.device) + return complex(a[..., 0], -a[..., 1]) + + +@tensor_operation +def real(a: np.array): + """Real part.""" + + if is_real(a): + raise ValueError('Last dimension must have length 2.') + + return a[..., 0] + + +@tensor_operation +def imag(a: np.array): + """Imaginary part.""" + + if is_real(a): + raise ValueError('Last dimension must have length 2.') + + return a[..., 1] + + +@tensor_operation +def complex(a: np.array, b: np.array=None): + """Create complex tensor from real and imaginary part.""" + + if b is None: + b = np.zeros(a.shape, a.dtype) + elif a is None: + a = np.zeros(b.shape, b.dtype) + + return np.concatenate((np.expand_dims(a, -1), np.expand_dims(b, -1)), -1) + + +@tensor_operation +def mtimes(a: np.array, b: np.array, conj_a=False, conj_b=False): + """Complex matrix multiplication of complex tensors. + The dimensions (-3, -2) are matrix multiplied. 
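+    For complex inputs the real and imaginary parts are computed as
+    Re = a_r @ b_r - a_i @ b_i and Im = a_r @ b_i + a_i @ b_r,
+    with a_i negated when conj_a is set and b_i negated when conj_b is set.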
-1 is the complex dimension.""" + + if is_real(a): + if a.ndim >= b.ndim: + raise ValueError('Incorrect dimensions.') + return mtimes_real_complex(a, b, conj_b=conj_b) + if is_real(b): + if b.ndim >= a.ndim: + raise ValueError('Incorrect dimensions.') + return mtimes_complex_real(a, b, conj_a=conj_a) + + if not conj_a and not conj_b: + return complex( + np.matmul(a[..., 0], b[..., 0]) - np.matmul(a[..., 1], b[..., 1]), + np.matmul(a[..., 0], b[..., 1]) + np.matmul(a[..., 1], b[..., 0])) + if conj_a and not conj_b: + return complex( + np.matmul(a[..., 0], b[..., 0]) + np.matmul(a[..., 1], b[..., 1]), + np.matmul(a[..., 0], b[..., 1]) - np.matmul(a[..., 1], b[..., 0])) + if not conj_a and conj_b: + return complex( + np.matmul(a[..., 0], b[..., 0]) + np.matmul(a[..., 1], b[..., 1]), + np.matmul(a[..., 1], b[..., 0]) - np.matmul(a[..., 0], b[..., 1])) + if conj_a and conj_b: + return complex( + np.matmul(a[..., 0], b[..., 0]) - np.matmul(a[..., 1], b[..., 1]), + -np.matmul(a[..., 0], b[..., 1]) - np.matmul(a[..., 1], b[..., 0])) + + +@tensor_operation +def mtimes_real_complex(a: np.array, b: np.array, conj_b=False): + if is_real(b): + raise ValueError('Incorrect dimensions.') + + if not conj_b: + return complex(np.matmul(a, b[..., 0]), np.matmul(a, b[..., 1])) + if conj_b: + return complex(np.matmul(a, b[..., 0]), -np.matmul(a, b[..., 1])) + + +@tensor_operation +def mtimes_complex_real(a: np.array, b: np.array, conj_a=False): + if is_real(a): + raise ValueError('Incorrect dimensions.') + + if not conj_a: + return complex(np.matmul(a[..., 0], b), np.matmul(a[..., 1], b)) + if conj_a: + return complex(np.matmul(a[..., 0], b), -np.matmul(a[..., 1], b)) + + +@tensor_operation +def exp_imag(a: np.array): + """Complex exponential with imaginary input: e^(i*a)""" + + a = np.expand_dims(a, -1) + return np.concatenate((np.cos(a), np.sin(a)), -1) diff --git a/PaddleCV/tracking/pytracking/libs/dcf.py b/PaddleCV/tracking/pytracking/libs/dcf.py new file mode 100644 index 0000000000000000000000000000000000000000..4aaa70c54253d80a7a1d8e470c6e840464a3b23c --- /dev/null +++ b/PaddleCV/tracking/pytracking/libs/dcf.py @@ -0,0 +1,137 @@ +import math +import numpy as np +from pytracking.libs import fourier +from pytracking.libs import complex +from pytracking.libs.paddle_utils import _padding + + +def hann1d(sz: int, centered=True) -> np.ndarray: + """1D cosine window.""" + if centered: + return 0.5 * (1 - np.cos( + (2 * math.pi / (sz + 2)) * np.arange(1, sz + 1, 1, 'float32'))) + w = 0.5 * (1 + np.cos( + (2 * math.pi / (sz + 2)) * np.arange(0, sz // 2 + 1, 1, 'float32'))) + return np.concatenate([w, np.flip(w[1:sz - sz // 2], 0)]) + + +def hann2d(sz: np.ndarray, centered=True) -> np.ndarray: + """2D cosine window.""" + return np.reshape(hann1d(sz[0], centered), (1, 1, -1, 1)) * \ + np.reshape(hann1d(sz[1], centered), (1, 1, 1, -1)) + + +def hann2d_clipped(sz: np.ndarray, effective_sz: np.ndarray, + centered=True) -> np.ndarray: + """1D clipped cosine window.""" + + # Ensure that the difference is even + effective_sz += (effective_sz - sz) % 2 + effective_window = np.reshape(hann1d(effective_sz[0], True), (1, 1, -1, 1)) * \ + np.reshape(hann1d(effective_sz[1], True), (1, 1, 1, -1)) + + pad = np.int32((sz - effective_sz) / 2) + window = _padding( + effective_window, (pad[1], pad[1], pad[0], pad[0]), mode='replicate') + + if centered: + return window + else: + mid = np.int32((sz / 2)) + window_shift_lr = np.concatenate( + (window[..., mid[1]:], window[..., :mid[1]]), 3) + return np.concatenate((window_shift_lr[..., 
mid[0]:, :], + window_shift_lr[..., :mid[0], :]), 2) + + +def gauss_fourier(sz: int, sigma: float, half: bool=False) -> np.ndarray: + if half: + k = np.arange(0, int(sz / 2 + 1), 1, 'float32') + else: + k = np.arange(-int((sz - 1) / 2), int(sz / 2 + 1), 1, 'float32') + return (math.sqrt(2 * math.pi) * sigma / sz) * np.exp(-2 * np.square( + math.pi * sigma * k / sz)) + + +def gauss_spatial(sz, sigma, center=0, end_pad=0): + k = np.arange(-(sz - 1) / 2, (sz + 1) / 2 + end_pad, 1, 'float32') + return np.exp(-1.0 / (2 * sigma**2) * np.square(k - center)) + + +def label_function(sz: np.ndarray, sigma: np.ndarray): + return np.reshape(gauss_fourier(sz[0], sigma[0]), (1, 1, -1, 1)) * \ + np.reshape(gauss_fourier(sz[1], sigma[1], True), (1, 1, 1, -1)) + + +def label_function_spatial(sz: np.ndarray, + sigma: np.ndarray, + center: np.ndarray=None, + end_pad: np.ndarray=None): + """The origin is in the middle of the image.""" + if center is None: center = np.zeros((2, ), 'float32') + if end_pad is None: end_pad = np.zeros((2, ), 'float32') + return np.reshape(gauss_spatial(sz[0], sigma[0], center[0], end_pad[0]), (1, 1, -1, 1)) * \ + np.reshape(gauss_spatial(sz[1], sigma[1], center[1], end_pad[1]), (1, 1, 1, -1)) + + +def cubic_spline_fourier(f, a): + """The continuous Fourier transform of a cubic spline kernel.""" + + bf = (6 * (1 - np.cos(2 * math.pi * f)) + 3 * a * (1 - np.cos(4 * math.pi * f)) + - (6 + 8 * a) * math.pi * f * np.sin(2 * math.pi * f) - 2 * a * math.pi * f * np.sin(4 * math.pi * f)) \ + / (4 * math.pi ** 4 * f ** 4) + bf[f == 0] = 1 + return bf + + +def get_interp_fourier(sz: np.ndarray, + method='ideal', + bicubic_param=0.5, + centering=True, + windowing=False, + device='cpu'): + ky, kx = fourier.get_frequency_coord(sz) + + if method == 'ideal': + interp_y = np.ones(ky.shape) / sz[0] + interp_x = np.ones(kx.shape) / sz[1] + elif method == 'bicubic': + interp_y = cubic_spline_fourier(ky / sz[0], bicubic_param) / sz[0] + interp_x = cubic_spline_fourier(kx / sz[1], bicubic_param) / sz[1] + else: + raise ValueError('Unknown method.') + + if centering: + interp_y = complex.mult(interp_y, + complex.exp_imag((-math.pi / sz[0]) * ky)) + interp_x = complex.mult(interp_x, + complex.exp_imag((-math.pi / sz[1]) * kx)) + + if windowing: + raise NotImplementedError + + return interp_y, interp_x + + +def interpolate_dft(a: np.ndarray, interp_fs) -> np.ndarray: + if isinstance(interp_fs, np.ndarray): + return complex.mult(a, interp_fs) + if isinstance(interp_fs, (tuple, list)): + return complex.mult(complex.mult(a, interp_fs[0]), interp_fs[1]) + raise ValueError('"interp_fs" must be tensor or tuple of tensors.') + + +def max2d(a: np.ndarray) -> (np.ndarray, np.ndarray): + """Computes maximum and argmax in the last two dimensions.""" + argmax_row = np.argmax(a, axis=-2) + max_val_row = np.max(a, axis=-2) + argmax_col = np.argmax(max_val_row, axis=-1) + max_val = np.max(max_val_row, axis=-1) + + argmax_row = np.reshape(argmax_row, ( + argmax_col.size, -1))[np.arange(argmax_col.size), argmax_col.flatten()] + argmax_row = argmax_row.reshape(argmax_col.shape) + argmax = np.concatenate( + (np.expand_dims(argmax_row, -1), np.expand_dims(argmax_col, -1)), -1) + + return max_val, argmax diff --git a/PaddleCV/tracking/pytracking/libs/fourier.py b/PaddleCV/tracking/pytracking/libs/fourier.py new file mode 100644 index 0000000000000000000000000000000000000000..d515db34901320cf4f2a20d0ad9b5e9de92dd03e --- /dev/null +++ b/PaddleCV/tracking/pytracking/libs/fourier.py @@ -0,0 +1,163 @@ +import numpy as np + +from 
pytracking.libs import complex, TensorList +from pytracking.libs.tensorlist import tensor_operation +from pytracking.libs.paddle_utils import _padding + + +@tensor_operation +def rfftshift2(a: np.array): + h = a.shape[2] + 2 + return np.concatenate([a[:, :, (h - 1) // 2:], a[:, :, :h // 2]], 2) + + +@tensor_operation +def irfftshift2(a: np.array): + mid = int((a.shape[2] - 1) / 2) + return np.concatenate([a[:, :, mid:], a[:, :, :mid]], 2) + + +@tensor_operation +def cfft2(a): + """Do FFT and center the low frequency component. + Always produces odd (full) output sizes.""" + out = rfftshift2(np.fft.rfft2(a)) + return np.stack([out.real, out.imag], axis=-1) + + +@tensor_operation +def cifft2(a, signal_sizes=None): + """Do inverse FFT corresponding to cfft2.""" + out = irfftshift2(a) + return np.fft.irfft2(out[..., 0] + 1j * out[..., 1], s=signal_sizes) + + +@tensor_operation +def sample_fs(a: np.array, grid_sz: np.array=None, rescale=True): + """Samples the Fourier series.""" + + # Size of the fourier series + sz = np.array([a.shape[2], 2 * a.shape[3] - 1], 'float32') + + # Default grid + if grid_sz is None or sz[0] == grid_sz[0] and sz[1] == grid_sz[1]: + if rescale: + return np.prod(sz) * cifft2(a) + return cifft2(a) + + if sz[0] > grid_sz[0] or sz[1] > grid_sz[1]: + raise ValueError( + "Only grid sizes that are smaller than the Fourier series size are supported." + ) + + tot_pad = (grid_sz - sz).tolist() + is_even = [s % 2 == 0 for s in sz] + + # Compute paddings + pad_top = int((tot_pad[0] + 1) / 2) if is_even[0] else int(tot_pad[0] / 2) + pad_bottom = int(tot_pad[0] - pad_top) + pad_right = int((tot_pad[1] + 1) / 2) + + if rescale: + return np.prod(grid_sz) * cifft2( + _padding(a, (0, 0, 0, pad_right, pad_top, pad_bottom)), + signal_sizes=grid_sz.astype('long').tolist()) + else: + return cifft2( + _padding(a, (0, 0, 0, pad_right, pad_top, pad_bottom)), + signal_sizes=grid_sz.astype('long').tolist()) + + +def get_frequency_coord(sz, add_complex_dim=False, device='cpu'): + """Frequency coordinates.""" + + ky = np.reshape( + np.arange( + -int((sz[0] - 1) / 2), int(sz[0] / 2 + 1), dtype='float32'), + (1, 1, -1, 1)) + kx = np.reshape( + np.arange( + 0, int(sz[1] / 2 + 1), dtype='float32'), (1, 1, 1, -1)) + + if add_complex_dim: + ky = np.expand_dims(ky, -1) + kx = np.expand_dims(kx, -1) + + return ky, kx + + +@tensor_operation +def shift_fs(a: np.array, shift: np.array): + """Shift a sample a in the Fourier domain. + Params: + a : The fourier coefficiens of the sample. 
+ shift : The shift to be performed normalized to the range [-pi, pi].""" + + if a.ndim != 5: + raise ValueError( + 'a must be the Fourier coefficients, a 5-dimensional tensor.') + + if shift[0] == 0 and shift[1] == 0: + return a + + ky, kx = get_frequency_coord((a.shape[2], 2 * a.shape[3] - 1)) + + return complex.mult( + complex.mult(a, complex.exp_imag(shift[0] * ky)), + complex.exp_imag(shift[1] * kx)) + + +def sum_fs(a: TensorList) -> np.array: + """Sum a list of Fourier series expansions.""" + + s = None + mid = None + + for e in sorted(a, key=lambda elem: elem.shape[-3], reverse=True): + if s is None: + s = e.copy() + mid = int((s.shape[-3] - 1) / 2) + else: + # Compute coordinates + top = mid - int((e.shape[-3] - 1) / 2) + bottom = mid + int(e.shape[-3] / 2) + 1 + right = e.shape[-2] + + # Add the data + s[..., top:bottom, :right, :] += e + + return s + + +def sum_fs12(a: TensorList) -> np.array: + """Sum a list of Fourier series expansions.""" + + s = None + mid = None + + for e in sorted(a, key=lambda elem: elem.shape[0], reverse=True): + if s is None: + s = e.copy() + mid = int((s.shape[0] - 1) / 2) + else: + # Compute coordinates + top = mid - int((e.shape[0] - 1) / 2) + bottom = mid + int(e.shape[0] / 2) + 1 + right = e.shape[1] + + # Add the data + s[top:bottom, :right, ...] += e + + return s + + +@tensor_operation +def inner_prod_fs(a: np.array, b: np.array): + if complex.is_complex(a) and complex.is_complex(b): + return 2 * (a.flatten() @b.flatten() + ) - a[:, :, :, 0, :].flatten() @b[:, :, :, 0, :].flatten() + elif complex.is_real(a) and complex.is_real(b): + return 2 * (a.flatten() @b.flatten() + ) - a[:, :, :, 0].flatten() @b[:, :, :, 0].flatten() + else: + raise NotImplementedError('Not implemented for mixed real and complex.') diff --git a/PaddleCV/tracking/pytracking/libs/operation.py b/PaddleCV/tracking/pytracking/libs/operation.py new file mode 100644 index 0000000000000000000000000000000000000000..62e5250a20a4124d05aed571cc8fda6f2100e3a2 --- /dev/null +++ b/PaddleCV/tracking/pytracking/libs/operation.py @@ -0,0 +1,59 @@ +from paddle import fluid +from paddle.fluid import layers +from pytracking.libs.Fconv2d import Fconv2d +from pytracking.libs.tensorlist import tensor_operation, TensorList +from paddle.fluid.framework import Variable as PTensor + + +@tensor_operation +def conv2d(input: PTensor, + weight: PTensor, + bias: PTensor=None, + stride=1, + padding=0, + dilation=1, + groups=1, + mode=None): + """Standard conv2d. Returns the input if weight=None.""" + + if weight is None: + return input + + ind = None + if mode is not None: + if padding != 0: + raise ValueError('Cannot input both padding and mode.') + if mode == 'same': + padding = (weight.shape[2] // 2, weight.shape[3] // 2) + if weight.shape[2] % 2 == 0 or weight.shape[3] % 2 == 0: + ind = (slice(-1) + if weight.shape[2] % 2 == 0 else slice(None), slice(-1) + if weight.shape[3] % 2 == 0 else slice(None)) + elif mode == 'valid': + padding = (0, 0) + elif mode == 'full': + padding = (weight.shape[2] - 1, weight.shape[3] - 1) + else: + raise ValueError('Unknown mode for padding.') + + assert bias is None + out = Fconv2d( + input, + weight, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups) + if ind is None: + return out + return out[:, :, ind[0], ind[1]] + + +@tensor_operation +def conv1x1(input: PTensor, weight: PTensor): + """Do a convolution with a 1x1 kernel weights. 
Implemented with matmul, which can be faster than using conv.""" + + if weight is None: + return input + + return Fconv2d(input, weight) diff --git a/PaddleCV/tracking/pytracking/libs/optimization.py b/PaddleCV/tracking/pytracking/libs/optimization.py new file mode 100644 index 0000000000000000000000000000000000000000..41236ad8c9f26451903020da77a5b3910345cd62 --- /dev/null +++ b/PaddleCV/tracking/pytracking/libs/optimization.py @@ -0,0 +1,779 @@ +import numpy as np +from paddle.fluid import layers +from paddle import fluid +from pytracking.libs.tensorlist import TensorList +from pytracking.utils.plotting import plot_graph +from pytracking.libs.paddle_utils import n2p, clone, static_clone + + +class L2Problem: + """Base class for representing an L2 optimization problem.""" + + def __call__(self, x: TensorList) -> TensorList: + """Shall compute the residuals of the problem.""" + raise NotImplementedError + + def ip_input(self, a, b): + """Inner product of the input space.""" + return sum(a.view(-1) @b.view(-1)) + + def ip_output(self, a, b): + """Inner product of the output space.""" + return sum(a.view(-1) @b.view(-1)) + + def M1(self, x): + """M1 preconditioner.""" + return x + + def M2(self, x): + """M2 preconditioner.""" + return x + + def get_feed_dict(self): + raise NotImplementedError + + +class MinimizationProblem: + """General minimization problem.""" + + def __call__(self, x: TensorList) -> TensorList: + """Shall compute the loss.""" + raise NotImplementedError + + def ip_input(self, a, b): + """Inner product of the input space.""" + return sum(a.view(-1) @b.view(-1)) + + def M1(self, x): + return x + + def M2(self, x): + return x + + def get_feed_dict(self): + raise NotImplementedError + + +class ConjugateGradientBase: + """Conjugate Gradient optimizer base class. Implements the CG loop.""" + + def __init__(self, + fletcher_reeves=True, + standard_alpha=True, + direction_forget_factor=0, + debug=False): + self.fletcher_reeves = fletcher_reeves + self.standard_alpha = standard_alpha + self.direction_forget_factor = direction_forget_factor + self.debug = debug + + # State + self.p = None + self.rho = np.ones((1, ), 'float32') + self.r_prev = None + + # Right hand side + self.b = None + + def reset_state(self): + self.p = None + self.rho = np.ones((1, ), 'float32') + self.r_prev = None + + def run_CG(self, num_iter, x=None, eps=0.0): + """Main conjugate gradient method. + + args: + num_iter: Number of iterations. + x: Initial guess. Assumed zero if None. + eps: Stop if the residual norm gets smaller than this. 
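+
+        Each iteration applies the M1/M2 preconditioners, updates the conjugate
+        direction p (Fletcher-Reeves or Polak-Ribiere formula, depending on
+        self.fletcher_reeves) and, when standard_alpha is True, takes a step of
+        length alpha = rho / (p^T A p) along it.
+
+        returns:
+            x: The estimated solution.
+            resvec: Residual norms per iteration (left as None in this port).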
+ """ + + # Apply forgetting factor + if self.direction_forget_factor == 0: + self.reset_state() + elif self.p is not None: + self.rho /= self.direction_forget_factor + + if x is None: + r = self.b.clone() + else: + r = self.b - self.A(x) + + # Norms of residuals etc for debugging + resvec = None + + # Loop over iterations + for ii in range(num_iter): + # Preconditioners + y = self.M1(r) + z = self.M2(y) + + rho1 = self.rho + self.rho = self.ip(r, z) + + if self.check_zero(self.rho): + if self.debug: + print('Stopped CG since rho = 0') + if resvec is not None: + resvec = resvec[:ii + 1] + return x, resvec + + if self.p is None: + self.p = z.clone() + else: + if self.fletcher_reeves: + beta = self.rho / rho1 + else: + rho2 = self.ip(self.r_prev, z) + beta = (self.rho - rho2) / rho1 + + beta = beta.apply(lambda a: np.clip(a, 0, 1e10)) + self.p = z + self.p * beta + + q = self.A(self.p) + pq = self.ip(self.p, q) + + if self.standard_alpha: + alpha = self.rho / pq + else: + alpha = self.ip(self.p, r) / pq + + # Save old r for PR formula + if not self.fletcher_reeves: + self.r_prev = r.clone() + + # Form new iterate + if x is None: + x = self.p * alpha + else: + x += self.p * alpha + + if ii < num_iter - 1 or self.debug: + r -= q * alpha + + if eps > 0.0 or self.debug: + normr = self.residual_norm(r) + + # if self.debug: + if True: + self.evaluate_CG_iteration(x) + # resvec[ii + 1] = normr + + if eps > 0 and normr <= eps: + if self.debug: + print('Stopped CG since norm smaller than eps') + break + + if resvec is not None: + resvec = resvec[:ii + 2] + + return x, resvec + + def A(self, x): + # Implements the left hand operation + raise NotImplementedError + + def ip(self, a, b): + # Implements the inner product + return a.view(-1) @b.view(-1) + + def residual_norm(self, r): + res = self.ip(r, r).sum() + if isinstance(res, (TensorList, list, tuple)): + res = sum(res) + return np.sqrt(res) + + def check_zero(self, s, eps=0.0): + ss = s.abs() <= eps + if isinstance(ss, (TensorList, list, tuple)): + ss = sum(ss) + return ss > 0 + + def M1(self, x): + # M1 preconditioner + return x + + def M2(self, x): + # M2 preconditioner + return x + + def evaluate_CG_iteration(self, x): + pass + + +class ConjugateGradient(ConjugateGradientBase): + """Conjugate Gradient optimizer, performing single linearization of the residuals in the start.""" + + def __init__(self, + problem: L2Problem, + variable: TensorList, + cg_eps=0.0, + fletcher_reeves=True, + standard_alpha=True, + direction_forget_factor=0, + debug=False, + analyze=False, + plotting=False, + fig_num=(10, 11)): + super().__init__(fletcher_reeves, standard_alpha, + direction_forget_factor, debug or plotting) + + self.problem = problem + self.x = variable + + self.plotting = plotting + self.fig_num = fig_num + + self.cg_eps = cg_eps + self.f0 = None + self.g = None + self.dfdxt_g = None + + self.residuals = np.zeros(0) + self.losses = np.zeros(0) + self._construct_graph() + self.analyze_convergence = analyze + + def clear_temp(self): + pass + + def _construct_graph(self): + train_program = fluid.Program() + start_program = fluid.Program() + with fluid.program_guard(train_program, start_program): + scope = 'first/' + self.x_ph = TensorList([ + fluid.layers.data( + '{}x_{}'.format(scope, idx), + v.shape, + append_batch_size=False, + stop_gradient=False) for idx, v in enumerate(self.x) + ]) + self.p_ph = TensorList([ + fluid.layers.data( + '{}p_{}'.format(scope, idx), + v.shape, + append_batch_size=False, + stop_gradient=False) for idx, v in enumerate(self.x) 
+ ]) + + # problem forward + self.f0 = self.problem(self.x_ph, scope) + + self.g = self.f0.apply(static_clone) + # self.g = self.f0 + + # Get df/dx^t @ f0 + self.dfdxt_g = TensorList( + fluid.gradients(self.f0, self.x_ph, self.g)) + + # For computing A + tmp = [a * b for a, b in zip(self.dfdxt_g, self.p_ph)] + self.dfdx_x = TensorList(fluid.gradients(tmp, self.g)) + # self.dfdx_x = TensorList(fluid.gradients(self.dfdxt_g, self.g, self.p_ph)) + + train_program2 = fluid.Program() + start_program2 = fluid.Program() + with fluid.program_guard(train_program2, start_program2): + scope = 'second/' + self.x_ph_2 = TensorList([ + fluid.layers.data( + '{}x_{}'.format(scope, idx), + v.shape, + append_batch_size=False, + stop_gradient=False) for idx, v in enumerate(self.x) + ]) + self.dfdx_x_ph = TensorList([ + fluid.layers.data( + '{}dfdx_x_{}'.format(scope, idx), + v.shape, + append_batch_size=False, + stop_gradient=False) for idx, v in enumerate(self.g) + ]) + + self.f0_2 = self.problem(self.x_ph_2, scope) + self.dfdx_dfdx = TensorList( + fluid.gradients(self.f0_2 * self.dfdx_x_ph, self.x_ph_2)) + + place = fluid.CUDAPlace(0) + self.exe = fluid.Executor(place) + self.exe.run(program=fluid.default_startup_program()) + self.compiled_prog = fluid.compiler.CompiledProgram(train_program) + + place2 = fluid.CUDAPlace(0) + self.exe2 = fluid.Executor(place2) + self.exe2.run(program=fluid.default_startup_program()) + self.compiled_prog2 = fluid.compiler.CompiledProgram(train_program2) + + def get_dfdxt_g(self): + scope = 'first/' + feed_dict = self.problem.get_feed_dict(scope) + # add variable feed + for idx, v in enumerate(self.x): + feed_dict['{}x_{}'.format(scope, idx)] = v + for idx, v in enumerate(self.x): + feed_dict['{}p_{}'.format(scope, idx)] = v + res = self.exe.run(self.compiled_prog, + feed=feed_dict, + fetch_list=[v.name for v in self.dfdxt_g]) + return TensorList(res) + + def run(self, num_cg_iter): + """Run the oprimizer with the provided number of iterations.""" + + if num_cg_iter == 0: + return + + # Get the right hand side + self.b = -self.get_dfdxt_g() + + self.evaluate_CG_iteration(0) + + # Run CG + delta_x, res = self.run_CG(num_cg_iter, eps=self.cg_eps) + + self.x += delta_x + + # reset problem training samples + self.problem.training_samples_stack = None + + def A(self, x): + # First pass + scope = 'first/' + feed_dict = self.problem.get_feed_dict(scope) + # add variable feed + for idx, v in enumerate(self.x): + feed_dict['{}x_{}'.format(scope, idx)] = v + # add p feed + for idx, v in enumerate(x): + feed_dict['{}p_{}'.format(scope, idx)] = v + + dfdx_x = TensorList( + self.exe.run(self.compiled_prog, + feed=feed_dict, + fetch_list=[v.name for v in self.dfdx_x])) + + # Second pass + scope = 'second/' + feed_dict = self.problem.get_feed_dict(scope) + # add variable feed + for idx, v in enumerate(self.x): + feed_dict['{}x_{}'.format(scope, idx)] = v + # add p feed + for idx, v in enumerate(dfdx_x): + feed_dict['{}dfdx_x_{}'.format(scope, idx)] = v + + res = TensorList( + self.exe2.run(self.compiled_prog2, + feed=feed_dict, + fetch_list=[v.name for v in self.dfdx_dfdx])) + + return res + + def ip(self, a, b): + return self.problem.ip_input(a, b) + + def M1(self, x): + return self.problem.M1(x) + + def M2(self, x): + return self.problem.M2(x) + + def evaluate_CG_iteration(self, delta_x): + if self.analyze_convergence: + scope = 'first/' + x = self.x + delta_x + feed_dict = self.problem.get_feed_dict(scope) + for idx, v in enumerate(x): + feed_dict['{}x_{}'.format(scope, idx)] = v + for 
idx, v in enumerate(x): + feed_dict['{}p_{}'.format(scope, idx)] = v + res = self.exe.run(self.compiled_prog, + feed=feed_dict, + fetch_list=[v.name for v in self.f0]) + res = TensorList(res) + loss = self.problem.ip_output(res, res) + #print('Paddle Loss: {}'.format(loss)) + + +class GaussNewtonCG(ConjugateGradientBase): + """Gauss-Newton with Conjugate Gradient optimizer.""" + + def __init__(self, + problem: L2Problem, + variable: TensorList, + cg_eps=0.0, + fletcher_reeves=True, + standard_alpha=True, + direction_forget_factor=0, + debug=False, + analyze=False, + plotting=False, + fig_num=(10, 11, 12)): + super().__init__(fletcher_reeves, standard_alpha, + direction_forget_factor, debug or analyze or plotting) + + self.problem = problem + self.x = variable + + self.analyze_convergence = analyze + self.plotting = plotting + self.fig_num = fig_num + + self.cg_eps = cg_eps + self.f0 = None + self.g = None + self.dfdxt_g = None + + self.residuals = np.zeros(0) + self.losses = np.zeros(0) + self.gradient_mags = np.zeros(0) + self._construct_graph() + + def clear_temp(self): + self.f0 = None + self.g = None + self.dfdxt_g = None + + def run_GN(self, *args, **kwargs): + return self.run(*args, **kwargs) + + def _construct_graph(self): + train_program = fluid.Program() + start_program = fluid.Program() + with fluid.program_guard(train_program, start_program): + scope = 'first/' + self.x_ph = TensorList([ + fluid.layers.data( + '{}x_{}'.format(scope, idx), + v.shape, + append_batch_size=False, + stop_gradient=False) for idx, v in enumerate(self.x) + ]) + self.p_ph = TensorList([ + fluid.layers.data( + '{}p_{}'.format(scope, idx), + v.shape, + append_batch_size=False, + stop_gradient=False) for idx, v in enumerate(self.x) + ]) + + # problem forward + self.f0 = self.problem(self.x_ph, scope) + + self.g = self.f0.apply(static_clone) + + # Get df/dx^t @ f0 + self.dfdxt_g = TensorList( + fluid.gradients(self.f0, self.x_ph, self.g)) + + # For computing A + tmp = [a * b for a, b in zip(self.dfdxt_g, self.p_ph)] + self.dfdx_x = TensorList(fluid.gradients(tmp, self.g)) + # self.dfdx_x = TensorList(fluid.gradients(self.dfdxt_g, self.g, self.p_ph)) + + train_program2 = fluid.Program() + start_program2 = fluid.Program() + with fluid.program_guard(train_program2, start_program2): + scope = 'second/' + self.x_ph_2 = TensorList([ + fluid.layers.data( + '{}x_{}'.format(scope, idx), + v.shape, + append_batch_size=False, + stop_gradient=False) for idx, v in enumerate(self.x) + ]) + self.dfdx_x_ph = TensorList([ + fluid.layers.data( + '{}dfdx_x_{}'.format(scope, idx), + v.shape, + append_batch_size=False, + stop_gradient=False) for idx, v in enumerate(self.g) + ]) + + self.f0_2 = self.problem(self.x_ph_2, scope) + self.dfdx_dfdx = TensorList( + fluid.gradients(self.f0_2 * self.dfdx_x_ph, self.x_ph_2)) + + place = fluid.CUDAPlace(0) + self.exe = fluid.Executor(place) + self.exe.run(program=fluid.default_startup_program()) + self.compiled_prog = fluid.compiler.CompiledProgram(train_program) + + place2 = fluid.CUDAPlace(0) + self.exe2 = fluid.Executor(place2) + self.exe2.run(program=fluid.default_startup_program()) + self.compiled_prog2 = fluid.compiler.CompiledProgram(train_program2) + + def get_dfdxt_g(self): + scope = 'first/' + feed_dict = self.problem.get_feed_dict(scope) + # add variable feed + for idx, v in enumerate(self.x): + feed_dict['{}x_{}'.format(scope, idx)] = v + for idx, v in enumerate(self.x): + feed_dict['{}p_{}'.format(scope, idx)] = v + res = self.exe.run(self.compiled_prog, + feed=feed_dict, + 
fetch_list=[v.name for v in self.dfdxt_g]) + return TensorList(res) + + def run(self, num_cg_iter, num_gn_iter=None): + """Run the optimizer. + args: + num_cg_iter: Number of CG iterations per GN iter. If list, then each entry specifies number of CG iterations + and number of GN iterations is given by the length of the list. + num_gn_iter: Number of GN iterations. Shall only be given if num_cg_iter is an integer. + """ + + if isinstance(num_cg_iter, int): + if num_gn_iter is None: + raise ValueError( + 'Must specify number of GN iter if CG iter is constant') + num_cg_iter = [num_cg_iter] * num_gn_iter + + num_gn_iter = len(num_cg_iter) + if num_gn_iter == 0: + return + + if self.analyze_convergence: + self.evaluate_CG_iteration(0) + + # Outer loop for running the GN iterations. + for cg_iter in num_cg_iter: + self.run_GN_iter(cg_iter) + + # reset problem training samples + self.problem.training_samples_stack = None + return self.losses, self.residuals + + def run_GN_iter(self, num_cg_iter): + """Runs a single GN iteration.""" + + self.b = -self.get_dfdxt_g() + + # Run CG + if num_cg_iter > 0: + delta_x, res = self.run_CG(num_cg_iter, eps=self.cg_eps) + self.x += delta_x + + def A(self, x): + # First pass + scope = 'first/' + feed_dict = self.problem.get_feed_dict(scope) + # add variable feed + for idx, v in enumerate(self.x): + feed_dict['{}x_{}'.format(scope, idx)] = v + # add p feed + for idx, v in enumerate(x): + feed_dict['{}p_{}'.format(scope, idx)] = v + + dfdx_x = TensorList( + self.exe.run(self.compiled_prog, + feed=feed_dict, + fetch_list=[v.name for v in self.dfdx_x])) + + # Second pass + scope = 'second/' + feed_dict = self.problem.get_feed_dict(scope) + # add variable feed + for idx, v in enumerate(self.x): + feed_dict['{}x_{}'.format(scope, idx)] = v + # add p feed + for idx, v in enumerate(dfdx_x): + feed_dict['{}dfdx_x_{}'.format(scope, idx)] = v + + res = TensorList( + self.exe2.run(self.compiled_prog2, + feed=feed_dict, + fetch_list=[v.name for v in self.dfdx_dfdx])) + + return res + + def ip(self, a, b): + return self.problem.ip_input(a, b) + + def M1(self, x): + return self.problem.M1(x) + + def M2(self, x): + return self.problem.M2(x) + + def evaluate_CG_iteration(self, delta_x): + if self.analyze_convergence: + scope = 'first/' + x = self.x + delta_x + feed_dict = self.problem.get_feed_dict(scope) + for idx, v in enumerate(x): + feed_dict['{}x_{}'.format(scope, idx)] = v + for idx, v in enumerate(x): + feed_dict['{}p_{}'.format(scope, idx)] = v + res = self.exe.run(self.compiled_prog, + feed=feed_dict, + fetch_list=[v.name for v in self.f0]) + res = TensorList(res) + loss = self.problem.ip_output(res, res) + #print('Paddle Loss: {}'.format(loss)) + + +class GradientDescentL2: + """Gradient descent with momentum for L2 problems.""" + + def __init__(self, + problem: L2Problem, + variable: TensorList, + step_length: float, + momentum: float=0.0, + debug=False, + plotting=False, + fig_num=(10, 11)): + + self.problem = problem + self.x = variable # Numpy arrays + + self.step_legnth = step_length + self.momentum = momentum + + self.debug = debug or plotting + self.plotting = plotting + self.fig_num = fig_num + + self.losses = np.zeros(0) + self.gradient_mags = np.zeros(0) + self.residuals = None + + self.clear_temp() + self._construct_graph() + + def clear_temp(self): + self.f0 = None + self.dir = None + + def _construct_graph(self): + train_program = fluid.Program() + start_program = fluid.Program() + with fluid.program_guard(train_program, start_program): + self.x_ph = 
TensorList([ + fluid.layers.data( + 'x_{}'.format(idx), + v.shape, + append_batch_size=False, + stop_gradient=False) for idx, v in enumerate(self.x) + ]) + + # problem forward + self.f0 = self.problem(self.x_ph) + self.loss = self.problem.ip_output(self.f0, self.f0) + # problem backward + self.grad = TensorList(fluid.gradients(self.loss, self.x_ph)) + + place = fluid.CUDAPlace(0) + self.exe = fluid.Executor(place) + self.exe.run(program=fluid.default_startup_program()) + self.compiled_prog = fluid.compiler.CompiledProgram(train_program) + + def get_feed_dict(self, x_list): + feed_dict = self.problem.get_feed_dict() + # add variable feed + for idx, v in enumerate(x_list): + feed_dict['x_{}'.format(idx)] = v + return feed_dict + + def run(self, num_iter, dummy=None): + if num_iter == 0: + return + + grad_names = [v.name for v in self.grad] + for i in range(num_iter): + res = self.exe.run(self.compiled_prog, + feed=self.get_feed_dict(self.x), + fetch_list=[self.loss.name] + grad_names) + if self.debug: + loss = res[0] + #print('Paddle Loss: {}'.format(loss)) + + grad = TensorList(res[1:]) + + # update parameters + if self.dir is None: + self.dir = grad + else: + self.dir = grad + self.momentum * self.dir + self.x = self.x - self.step_legnth * self.dir + + # reset problem training samples + self.problem.training_samples_stack = None + + +class GradientDescent: + """Gradient descent for general minimization problems.""" + + def __init__(self, + problem: MinimizationProblem, + variable: TensorList, + step_length: float, + momentum: float=0.0, + debug=False, + plotting=False, + fig_num=(10, 11)): + + self.problem = problem + self.x = variable + + self.step_legnth = step_length + self.momentum = momentum + + self.debug = debug or plotting + self.plotting = plotting + self.fig_num = fig_num + + self.losses = layers.zeros((0, ), 'float32') + self.gradient_mags = layers.zeros((0, ), 'float32') + self.residuals = None + + self.clear_temp() + + def clear_temp(self): + self.dir = None + + def run(self, num_iter, dummy=None): + + if num_iter == 0: + return + + lossvec = None + if self.debug: + lossvec = np.zeros((num_iter + 1, )) + grad_mags = np.zeros((num_iter + 1, )) + + for i in range(num_iter): + self.x.stop_gradient = False + + # Evaluate function at current estimate + loss = self.problem(self.x) + + # Compute grad + loss.backward() + grad = TensorList(self.x.gradient()).apply(n2p) + self.x.clear_gradient() + + # Update direction + if self.dir is None: + self.dir = grad + else: + self.dir = grad + self.momentum * self.dir + + self.x = self.x.detach() + self.x -= self.step_legnth * self.dir + + if self.debug: + lossvec[i] = loss.numpy() + grad_mags[i] = self.problem.ip_input( + grad, grad).apply(layers.sqrt).numpy() + + self.problem.training_samples_stack = None + + self.x = self.x.detach() + self.x.stop_gradient = True + self.clear_temp() diff --git a/PaddleCV/tracking/pytracking/libs/paddle_utils.py b/PaddleCV/tracking/pytracking/libs/paddle_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e5fb99d2df6d571cc23525df5dd25a5c23b1bfe0 --- /dev/null +++ b/PaddleCV/tracking/pytracking/libs/paddle_utils.py @@ -0,0 +1,218 @@ +import numpy as np +import paddle +from paddle.fluid import dygraph +from paddle.fluid import layers +from paddle.fluid.framework import Variable +import cv2 as cv +PTensor = Variable + + +def broadcast_op(a, b, op='mul'): + a_expand_factors = [] + b_expand_factors = [] + assert len(a.shape) == len( + b.shape), 'a.shape = {} while b.shape = 
{}'.format(a.shape, b.shape) + for a_s, b_s in zip(a.shape, b.shape): + if a_s != b_s: + if a_s == 1: + a_expand_factors.append(b_s) + b_expand_factors.append(1) + elif b_s == 1: + a_expand_factors.append(1) + b_expand_factors.append(a_s) + else: + raise NotImplementedError + else: + a_expand_factors.append(1) + b_expand_factors.append(1) + if op == 'mul': + op = layers.elementwise_mul + elif op == 'add': + op = layers.elementwise_add + elif op == 'sub': + op = layers.elementwise_sub + elif op == 'div': + op = layers.elementwise_div + else: + raise NotImplementedError + return op( + layers.expand(a, a_expand_factors), layers.expand(b, b_expand_factors)) + + +def paddle_prod(x): + prod = 1 + num_elems = x.shape[0] + for idx in range(num_elems): + prod *= x[idx] + return prod + + +def n2p(x, dtype=None): + if dtype is None: + x = np.array(x) + if x.dtype == np.float64: + x = x.astype('float32') + else: + x = np.array(x, dtype=dtype) + return dygraph.to_variable(x) + + +def p2n(x): + return x.numpy() + + +def clone(x): + v = dygraph.to_variable(x.numpy()) + v.stop_gradient = x.stop_gradient + return v + + +def static_identity(x): + x = layers.reshape(x, x.shape) + return x + + +def static_clone(x): + x1 = static_identity(x) + x1.stop_gradient = True + x2 = static_identity(x1) + x2.stop_gradient = x.stop_gradient + return x2 + + +def detach(x): + v = dygraph.to_variable(x.numpy()) + v.stop_gradient = True + return v + + +def squeeze(input, axes): + new_shape = [] + for i, s in enumerate(input.shape): + if i in axes: + assert s == 1 + else: + new_shape.append(s) + return layers.reshape(input, new_shape) + + +def unsqueeze(input, axes): + new_shape = [] + for i, s in enumerate(input.shape): + for a in axes: + if i == a: + new_shape.append(1) + new_shape.append(s) + return layers.reshape(input, new_shape) + + +def crop(x, crops): + slices = [] + for c in crops: + c1 = None if c[1] == 0 else -c[1] + slices.append(slice(c[0], c1)) + return x[tuple(slices)] + + +def _padding(x, pads, mode='constant'): + return_tensor = False + if isinstance(x, PTensor): + x = x.numpy() + return_tensor = True + + assert len(pads) % 2 == 0 + pads = list(pads) + [0] * (len(x.shape) * 2 - len(pads)) + + # convert to numpy pad format + pads_np, pad_per_dim = [], [] + for i, p in enumerate(pads): + if i % 2 == 0: + pad_per_dim = [p] + else: + pad_per_dim.append(p) + pads_np.insert(0, pad_per_dim) + + # handle negative pads (cropping) + pads_np_pos, pads_np_neg = [], [] + for pad_per_dim in pads_np: + pad_per_dim_pos, pad_per_dim_neg = [], [] + for p in pad_per_dim: + if p < 0: + pad_per_dim_pos.append(0) + pad_per_dim_neg.append(-p) + else: + pad_per_dim_pos.append(p) + pad_per_dim_neg.append(0) + pads_np_pos.append(pad_per_dim_pos) + pads_np_neg.append(pad_per_dim_neg) + + # cropping + x = crop(x, pads_np_neg) + + # padding + # if x is an image + if len(x.shape) == 3 and pads_np_pos[-1][0] == 0 and pads_np_pos[-1][ + 1] == 0: + if mode == 'replicate': + pad_mode = cv.BORDER_REPLICATE + else: + pad_mode = cv.BORDER_CONSTANT + y1_pad, y2_pad = pads_np_pos[0] + x1_pad, x2_pad = pads_np_pos[1] + x = cv.copyMakeBorder(x, y1_pad, y2_pad, x1_pad, x2_pad, pad_mode) + else: + np_mode = 'edge' if mode == 'replicate' else 'constant' + x = np.pad(x, pads_np_pos, mode=np_mode) + + out = dygraph.to_variable(x) if return_tensor else x + return out + + +def mod(a, b): + arg_list, new_arg_list = [a, b], [] + return_PTensor = False + for x in arg_list: + if isinstance(x, PTensor): + x = p2n(x) + return_PTensor = True + 
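+        # The operands are normalized to numpy arrays in this loop; if any
+        # input was a PTensor, the numpy result is converted back with n2p
+        # before returning.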
new_arg_list.append(x) + + out = new_arg_list[0] % new_arg_list[1] + return n2p(out) if return_PTensor else out + + +def floordiv(a, b): + arg_list, new_arg_list = [a, b], [] + return_PTensor = False + for x in arg_list: + if isinstance(x, PTensor): + x = p2n(x) + return_PTensor = True + new_arg_list.append(x) + + out = new_arg_list[0] // new_arg_list[1] + return n2p(out) if return_PTensor else out + + +def stack_sum(x): + return layers.reduce_sum(layers.stack(x)) + + +def leaky_relu(x, alpha): + return layers.relu(x) + alpha * (-1 * layers.relu(-1 * x)) + + +def elu(x, alpha): + return layers.relu(x) + alpha * (layers.exp(-1 * layers.relu(-1 * x)) - 1) + + +def dropout2d(input, prob, is_train=False): + if not is_train: + return input + channels = input.shape[1] + keep_prob = 1.0 - prob + random_tensor = keep_prob + layers.uniform_random_batch_size_like( + input, [-1, channels, 1, 1], min=0., max=1.) + binary_tensor = layers.floor(random_tensor) + output = input / keep_prob * binary_tensor + return output diff --git a/PaddleCV/tracking/pytracking/libs/tensordict.py b/PaddleCV/tracking/pytracking/libs/tensordict.py new file mode 100644 index 0000000000000000000000000000000000000000..10325dd64b2170b09baf7f13cd204c61706d63ae --- /dev/null +++ b/PaddleCV/tracking/pytracking/libs/tensordict.py @@ -0,0 +1,36 @@ +from collections import OrderedDict + + +class TensorDict(OrderedDict): + """Container mainly used for dicts of Variable.""" + + def concat(self, other): + """Concatenates two dicts without copying internal data.""" + return TensorDict(self, **other) + + def copy(self): + return TensorDict(super(TensorDict, self).copy()) + + def __getattr__(self, name): + for n, e in self.items(): + if not hasattr(e, name): + raise AttributeError('\'{}\' object has not attribute \'{}\''. + format(type(e), name)) + + def apply_attr(*args, **kwargs): + return TensorDict({ + n: getattr(e, name)(*args, **kwargs) if hasattr(e, name) else e + for n, e in self.items() + }) + + return apply_attr + + def attribute(self, attr: str, *args): + return TensorDict({n: getattr(e, attr, *args) for n, e in self.items()}) + + def apply(self, fn, *args, **kwargs): + return TensorDict({n: fn(e, *args, **kwargs) for n, e in self.items()}) + + @staticmethod + def _iterable(a): + return isinstance(a, (TensorDict, list)) diff --git a/PaddleCV/tracking/pytracking/libs/tensorlist.py b/PaddleCV/tracking/pytracking/libs/tensorlist.py new file mode 100644 index 0000000000000000000000000000000000000000..24d6e3d2a9488e7c75286233e5c10929fa339f35 --- /dev/null +++ b/PaddleCV/tracking/pytracking/libs/tensorlist.py @@ -0,0 +1,268 @@ +import functools +import numpy as np +from paddle.fluid import layers + +from pytracking.libs.paddle_utils import clone as clone_fn +from pytracking.libs.paddle_utils import detach as detach_fn +from pytracking.libs.paddle_utils import PTensor + + +def matmul(a, b): + if isinstance(a, PTensor) or isinstance(b, PTensor): + return layers.matmul(a, b) + else: + return np.matmul(a, b) + + +class TensorList(list): + """Container mainly used for lists of paddle tensors. 
Extends lists with paddle functionality.""" + + def __init__(self, list_of_tensors=list()): + super(TensorList, self).__init__(list_of_tensors) + + def __getitem__(self, item): + if isinstance(item, int): + return super(TensorList, self).__getitem__(item) + elif isinstance(item, (tuple, list)): + return TensorList( + [super(TensorList, self).__getitem__(i) for i in item]) + else: + return TensorList(super(TensorList, self).__getitem__(item)) + + def __add__(self, other): + if TensorList._iterable(other): + return TensorList([e1 + e2 for e1, e2 in zip(self, other)]) + return TensorList([e + other for e in self]) + + def __radd__(self, other): + if TensorList._iterable(other): + return TensorList([e2 + e1 for e1, e2 in zip(self, other)]) + return TensorList([other + e for e in self]) + + def __iadd__(self, other): + if TensorList._iterable(other): + for i, e2 in enumerate(other): + self[i] += e2 + else: + for i in range(len(self)): + self[i] += other + return self + + def __sub__(self, other): + if TensorList._iterable(other): + return TensorList([e1 - e2 for e1, e2 in zip(self, other)]) + return TensorList([e - other for e in self]) + + def __rsub__(self, other): + if TensorList._iterable(other): + return TensorList([e2 - e1 for e1, e2 in zip(self, other)]) + return TensorList([other - e for e in self]) + + def __isub__(self, other): + if TensorList._iterable(other): + for i, e2 in enumerate(other): + self[i] -= e2 + else: + for i in range(len(self)): + self[i] -= other + return self + + def __mul__(self, other): + if TensorList._iterable(other): + return TensorList([e1 * e2 for e1, e2 in zip(self, other)]) + return TensorList([e * other for e in self]) + + def __rmul__(self, other): + if TensorList._iterable(other): + return TensorList([e2 * e1 for e1, e2 in zip(self, other)]) + return TensorList([other * e for e in self]) + + def __imul__(self, other): + if TensorList._iterable(other): + for i, e2 in enumerate(other): + self[i] *= e2 + else: + for i in range(len(self)): + self[i] *= other + return self + + def __truediv__(self, other): + if TensorList._iterable(other): + return TensorList([e1 / e2 for e1, e2 in zip(self, other)]) + return TensorList([e / other for e in self]) + + def __rtruediv__(self, other): + if TensorList._iterable(other): + return TensorList([e2 / e1 for e1, e2 in zip(self, other)]) + return TensorList([other / e for e in self]) + + def __itruediv__(self, other): + if TensorList._iterable(other): + for i, e2 in enumerate(other): + self[i] /= e2 + else: + for i in range(len(self)): + self[i] /= other + return self + + def __matmul__(self, other): + if TensorList._iterable(other): + return TensorList([matmul(e1, e2) for e1, e2 in zip(self, other)]) + return TensorList([matmul(e, other) for e in self]) + + def __rmatmul__(self, other): + if TensorList._iterable(other): + return TensorList([matmul(e2, e1) for e1, e2 in zip(self, other)]) + return TensorList([matmul(other, e) for e in self]) + + def __imatmul__(self, other): + if TensorList._iterable(other): + for i, e2 in enumerate(other): + self[i] = matmul(self[i], e2) + else: + for i in range(len(self)): + self[i] = matmul(self[i], other) + return self + + def __mod__(self, other): + if TensorList._iterable(other): + return TensorList([e1 % e2 for e1, e2 in zip(self, other)]) + return TensorList([e % other for e in self]) + + def __rmod__(self, other): + if TensorList._iterable(other): + return TensorList([e2 % e1 for e1, e2 in zip(self, other)]) + return TensorList([other % e for e in self]) + + def __pos__(self): + 
return TensorList([+e for e in self]) + + def __neg__(self): + return TensorList([-e for e in self]) + + def __le__(self, other): + if TensorList._iterable(other): + return TensorList([e1 <= e2 for e1, e2 in zip(self, other)]) + return TensorList([e <= other for e in self]) + + def __ge__(self, other): + if TensorList._iterable(other): + return TensorList([e1 >= e2 for e1, e2 in zip(self, other)]) + return TensorList([e >= other for e in self]) + + def view(self, *args): + def reshape(x): + if isinstance(x, PTensor): + return layers.reshape(x, args) + else: + return np.reshape(x, args) + + return self.apply(reshape) + + def clone(self): + def _clone(x): + if isinstance(x, PTensor): + return clone_fn(x) + else: + return x.copy() + + return self.apply(_clone) + + def detach(self): + return self.apply(detach_fn) + + def sqrt(self): + def _sqrt(x): + if isinstance(x, PTensor): + return layers.sqrt(x) + else: + return np.sqrt(x) + + return self.apply(_sqrt) + + def abs(self): + def _abs(x): + if isinstance(x, PTensor): + return layers.abs(x) + else: + return np.abs(x) + + return self.apply(_abs) + + def size(self, axis=None): + def get_size(x): + if axis is None: + return x.shape + else: + return x.shape[axis] + + return self.apply(get_size) + + def concat(self, other): + return TensorList(super(TensorList, self).__add__(other)) + + def copy(self): + return TensorList(super(TensorList, self).copy()) + + def unroll(self): + if not any(isinstance(t, TensorList) for t in self): + return self + + new_list = TensorList() + for t in self: + if isinstance(t, TensorList): + new_list.extend(t.unroll()) + else: + new_list.append(t) + return new_list + + def attribute(self, attr: str, *args): + return TensorList([getattr(e, attr, *args) for e in self]) + + def apply(self, fn): + return TensorList([fn(e) for e in self]) + + def __getattr__(self, name): + for e in self: + if not hasattr(e, name): + raise AttributeError('\'{}\' object has not attribute \'{}\''. + format(type(e), name)) + + def apply_attr(*args, **kwargs): + return TensorList([getattr(e, name)(*args, **kwargs) for e in self]) + + return apply_attr + + @staticmethod + def _iterable(a): + return isinstance(a, (TensorList, list)) + + +def tensor_operation(op): + def islist(a): + return isinstance(a, TensorList) + + @functools.wraps(op) + def oplist(*args, **kwargs): + if len(args) == 0: + raise ValueError( + 'Must be at least one argument without keyword (i.e. 
operand).') + + if len(args) == 1: + if islist(args[0]): + return TensorList([op(a, **kwargs) for a in args[0]]) + else: + # Multiple operands, assume max two + if islist(args[0]) and islist(args[1]): + return TensorList( + [op(a, b, *args[2:], **kwargs) for a, b in zip(*args[:2])]) + if islist(args[0]): + return TensorList([op(a, *args[1:], **kwargs) for a in args[0]]) + if islist(args[1]): + return TensorList( + [op(args[0], b, *args[2:], **kwargs) for b in args[1]]) + + # None of the operands are lists + return op(*args, **kwargs) + + return oplist diff --git a/PaddleCV/tracking/pytracking/parameter/atom/__init__.py b/PaddleCV/tracking/pytracking/parameter/atom/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/PaddleCV/tracking/pytracking/parameter/atom/default_vot.py b/PaddleCV/tracking/pytracking/parameter/atom/default_vot.py new file mode 100644 index 0000000000000000000000000000000000000000..e59e018785f548b1903f02588caf1ae99b300984 --- /dev/null +++ b/PaddleCV/tracking/pytracking/parameter/atom/default_vot.py @@ -0,0 +1,114 @@ +import numpy as np + +from pytracking.features.deep import ResNet18, ResNet50 +from pytracking.features.extractor import MultiResolutionExtractor +from pytracking.utils import TrackerParams, FeatureParams + + +def parameters(): + params = TrackerParams() + + # These are usually set from outside + params.debug = 0 # Debug level + params.visualization = False # Do visualization + + # Use GPU or not (IoUNet requires this to be True) + params.use_gpu = True + + # Feature specific parameters + deep_params = TrackerParams() + + # Patch sampling parameters + params.max_image_sample_size = (14 * 16)**2 # Maximum image sample size + params.min_image_sample_size = (14 * 16)**2 # Minimum image sample size + params.search_area_scale = 4 # Scale relative to target size + params.feature_size_odd = False # Good to use False for even-sized kernels and vice versa + + # Optimization parameters + params.CG_iter = 5 # The number of Conjugate Gradient iterations in each update after the first frame + params.init_CG_iter = 60 # The total number of Conjugate Gradient iterations used in the first frame + params.init_GN_iter = 6 # The number of Gauss-Newton iterations used in the first frame (only if the projection matrix is updated) + params.post_init_CG_iter = 0 # CG iterations to run after GN + params.fletcher_reeves = False # Use the Fletcher-Reeves (true) or Polak-Ribiere (false) formula in the Conjugate Gradient + params.standard_alpha = True # Use the standard formula for computing the step length in Conjugate Gradient + params.CG_forgetting_rate = None # Forgetting rate of the last conjugate direction + + # Learning parameters for each feature type + deep_params.learning_rate = 0.0075 # Learning rate + deep_params.output_sigma_factor = 1 / 4 # Standard deviation of Gaussian label relative to target size + + # Training parameters + params.sample_memory_size = 250 # Memory size + params.train_skipping = 10 # How often to run training (every n-th frame) + + # Online model parameters + # deep_params.kernel_size = (4, 4) # when slice double grad is support, + # else, deep_params.kernel_size = (5, 5) + deep_params.kernel_size = (5, 5) # Kernel size of filter + deep_params.compressed_dim = 64 # Dimension output of projection matrix + deep_params.filter_reg = 1e-1 # Filter regularization factor + deep_params.projection_reg = 1e-4 # Projection regularization factor + + # Windowing + 
params.feature_window = False # Perform windowing of features + params.window_output = True # Perform windowing of output scores + + # Detection parameters + params.scale_factors = np.array( + [1], dtype='float32' + ) # What scales to use for localization (only one scale if IoUNet is used) + params.score_upsample_factor = 1 # How much Fourier upsampling to use + + # Init data augmentation parameters + params.augmentation = { + 'fliplr': True, + 'rotate': [5, -5, 10, -10, 20, -20, 30, -30, 45, -45, -60, 60], + 'blur': [(2, 0.2), (0.2, 2), (3, 1), (1, 3), (2, 2)], + 'relativeshift': [(0.6, 0.6), (-0.6, 0.6), (0.6, -0.6), (-0.6, -0.6)], + 'dropout': (7, 0.2) + } + + params.augmentation_expansion_factor = 2 # How much to expand sample when doing augmentation + params.random_shift_factor = 1 / 3 # How much random shift to do on each augmented sample + deep_params.use_augmentation = True # Whether to use augmentation for this feature + + # Factorized convolution parameters + # params.use_projection_matrix = True # Use projection matrix, i.e. use the factorized convolution formulation + params.update_projection_matrix = True # Whether the projection matrix should be optimized or not + params.proj_init_method = 'randn' # Method for initializing the projection matrix + params.filter_init_method = 'randn' # Method for initializing the spatial filter + params.projection_activation = 'none' # Activation function after projection ('none', 'relu', 'elu' or 'mlu') + params.response_activation = ( + 'mlu', 0.05 + ) # Activation function on the output scores ('none', 'relu', 'elu' or 'mlu') + + # Advanced localization parameters + params.advanced_localization = True # Use this or not + params.target_not_found_threshold = -1 # Absolute score threshold to detect target missing + params.distractor_threshold = 100 # Relative threshold to find distractors + params.hard_negative_threshold = 0.3 # Relative threshold to find hard negative samples + params.target_neighborhood_scale = 2.2 # Target neighborhood to remove + params.dispalcement_scale = 0.7 # Dispacement to consider for distractors + params.hard_negative_learning_rate = 0.02 # Learning rate if hard negative detected + params.hard_negative_CG_iter = 5 # Number of optimization iterations to use if hard negative detected + params.update_scale_when_uncertain = True # Update scale or not if distractor is close + + # IoUNet parameters + params.iounet_augmentation = False # Use the augmented samples to compute the modulation vector + params.iounet_k = 3 # Top-k average to estimate final box + params.num_init_random_boxes = 9 # Num extra random boxes in addition to the classifier prediction + params.box_jitter_pos = 0.1 # How much to jitter the translation for random boxes + params.box_jitter_sz = 0.5 # How much to jitter the scale for random boxes + params.maximal_aspect_ratio = 6 # Limit on the aspect ratio + params.box_refinement_iter = 5 # Number of iterations for refining the boxes + params.box_refinement_step_length = 1 # Gradient step length in the bounding box refinement + params.box_refinement_step_decay = 1 # Multiplicative step length decay (1 means no decay) + + # Setup the feature extractor (which includes the IoUNet) + deep_fparams = FeatureParams(feature_params=[deep_params]) + deep_feat = ResNet18( + output_layers=['block2'], fparams=deep_fparams, normalize_power=2) + params.features = MultiResolutionExtractor([deep_feat]) + + params.vot_anno_conversion_type = 'preserve_area' + return params diff --git 
a/PaddleCV/tracking/pytracking/parameter/siamfc/default.py b/PaddleCV/tracking/pytracking/parameter/siamfc/default.py new file mode 100644 index 0000000000000000000000000000000000000000..d77c11f9a9e5dd3cd856d7a227be3a2cb126c22d --- /dev/null +++ b/PaddleCV/tracking/pytracking/parameter/siamfc/default.py @@ -0,0 +1,56 @@ +import numpy as np + +from pytracking.features import deep +from pytracking.features.extractor import MultiResolutionExtractor +from pytracking.utils import TrackerParams, FeatureParams + + +def parameters(): + params = TrackerParams() + + # These are usually set from outside + params.debug = 0 # Debug level + params.visualization = False # Do visualization + + # Use GPU or not (IoUNet requires this to be True) + params.use_gpu = True + + # Feature specific parameters + deep_params = TrackerParams() + + # Patch sampling parameters + params.exemplar_size = 127 + params.max_image_sample_size = 255 * 255 # Maximum image sample size + params.min_image_sample_size = 255 * 255 # Minimum image sample size + + # Detection parameters + params.scale_factors = 1.0375**np.array( + [-1, 0, 1] + ) # What scales to use for localization (only one scale if IoUNet is used) + params.score_upsample_factor = 16 # How much Fourier upsampling to use + params.scale_penalty = 0.9745 + params.scale_lr = 0.59 + params.window_influence = 0.176 + params.total_stride = 8 + + # Setup the feature extractor (which includes the IoUNet) + deep_fparams = FeatureParams(feature_params=[deep_params]) + deep_feat = deep.SFCAlexnet( + net_path='/ssd2/bily/code/baidu/personal-code/pytracking/ltr/checkpoints/ltr/fs/siamrpn50/SiamRPN_ep0001.pth.tar', + output_layers=['conv5'], + fparams=deep_fparams) + params.features = MultiResolutionExtractor([deep_feat]) + + params.net_path = None + params.response_up = 16 + params.response_sz = 17 + params.context = 0.5 + params.instance_sz = 255 + params.exemplar_sz = 127 + params.scale_num = 3 + params.scale_step = 1.0375 + params.scale_lr = 0.59 + params.scale_penalty = 0.9745 + params.window_influence = 0.176 + params.total_stride = 8 + return params diff --git a/PaddleCV/tracking/pytracking/tracker/__init__.py b/PaddleCV/tracking/pytracking/tracker/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/PaddleCV/tracking/pytracking/tracker/atom/__init__.py b/PaddleCV/tracking/pytracking/tracker/atom/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..40ad3b507bba6ab768fcae3db0c5d71724b6627d --- /dev/null +++ b/PaddleCV/tracking/pytracking/tracker/atom/__init__.py @@ -0,0 +1,5 @@ +from .atom import ATOM + + +def get_tracker_class(): + return ATOM diff --git a/PaddleCV/tracking/pytracking/tracker/atom/atom.py b/PaddleCV/tracking/pytracking/tracker/atom/atom.py new file mode 100644 index 0000000000000000000000000000000000000000..9a08cff271f76ac38496f51ece2dcab166a0b137 --- /dev/null +++ b/PaddleCV/tracking/pytracking/tracker/atom/atom.py @@ -0,0 +1,1021 @@ +import math +import os +import time + +import numpy as np +from paddle import fluid +from paddle.fluid import layers + +from pytracking.features import augmentation +from pytracking.libs import dcf, operation, fourier +from pytracking.libs.optimization import ConjugateGradient, GaussNewtonCG, GradientDescentL2 +from pytracking.libs.paddle_utils import mod, n2p, \ + leaky_relu, dropout2d +from pytracking.libs.tensorlist import TensorList +from pytracking.tracker.atom.optim import FactorizedConvProblem, ConvProblem 
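+# Usage sketch (illustrative only, not part of this patch's API surface): the
+# parameter modules above and the ATOM tracker defined below are typically
+# wired together roughly as follows. `read_frame`, `init_state` and
+# `num_frames` are hypothetical placeholders (an HxWx3 uint8 image loader, the
+# first-frame box and the sequence length), and the BaseTracker constructor is
+# assumed to accept the params object:
+#
+#     from pytracking.parameter.atom.default_vot import parameters
+#     from pytracking.tracker.atom import ATOM
+#
+#     params = parameters()
+#     tracker = ATOM(params)
+#     tracker.initialize(read_frame(0), init_state)  # init_state = [x, y, w, h]
+#     for t in range(1, num_frames):
+#         state = tracker.track(read_frame(t))       # returns [x, y, w, h]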
+from pytracking.tracker.base.basetracker import BaseTracker + + +class ATOM(BaseTracker): + def initialize_features(self): + if not getattr(self, 'features_initialized', False): + self.params.features.initialize() + self.features_initialized = True + + def initialize(self, image, state, *args, **kwargs): + # Initialize some stuff + self.frame_num = 1 + # TODO: for now, we don't support explictly setting up device + # if not hasattr(self.params, 'device'): + # self.params.device = 'cuda' if self.params.use_gpu else 'cpu' + + # Initialize features + self.initialize_features() + + # Check if image is color + self.params.features.set_is_color(image.shape[2] == 3) + + # Get feature specific params + self.fparams = self.params.features.get_fparams('feature_params') + + self.time = 0 + tic = time.time() + + # Get position and size + self.pos = np.array( + [state[1] + (state[3] - 1) / 2, state[0] + (state[2] - 1) / 2], + 'float32') + self.target_sz = np.array([state[3], state[2]], 'float32') + + # Set search area + self.target_scale = 1.0 + search_area = np.prod(self.target_sz * self.params.search_area_scale) + if search_area > self.params.max_image_sample_size: + self.target_scale = math.sqrt(search_area / + self.params.max_image_sample_size) + elif search_area < self.params.min_image_sample_size: + self.target_scale = math.sqrt(search_area / + self.params.min_image_sample_size) + + # Check if IoUNet is used + self.use_iou_net = getattr(self.params, 'use_iou_net', True) + + # Target size in base scale + self.base_target_sz = self.target_sz / self.target_scale + + # Use odd square search area and set sizes + feat_max_stride = max(self.params.features.stride()) + if getattr(self.params, 'search_area_shape', 'square') == 'square': + self.img_sample_sz = np.ones((2, ), 'float32') * np.round( + np.sqrt( + np.prod(self.base_target_sz * + self.params.search_area_scale))) + elif self.params.search_area_shape == 'initrect': + self.img_sample_sz = np.round(self.base_target_sz * + self.params.search_area_scale) + else: + raise ValueError('Unknown search area shape') + if self.params.feature_size_odd: + self.img_sample_sz += feat_max_stride - mod(self.img_sample_sz, + (2 * feat_max_stride)) + else: + self.img_sample_sz += feat_max_stride - mod( + (self.img_sample_sz + feat_max_stride), (2 * feat_max_stride)) + + # Set sizes + self.img_support_sz = self.img_sample_sz + self.feature_sz = self.params.features.size(self.img_sample_sz) + self.output_sz = self.params.score_upsample_factor * self.img_support_sz # Interpolated size of the output + self.kernel_size = self.fparams.attribute('kernel_size') + + self.iou_img_sample_sz = self.img_sample_sz + + # Optimization options + self.params.precond_learning_rate = self.fparams.attribute( + 'learning_rate') + if self.params.CG_forgetting_rate is None or max( + self.params.precond_learning_rate) >= 1: + self.params.direction_forget_factor = 0 + else: + self.params.direction_forget_factor = ( + 1 - max(self.params.precond_learning_rate) + )**self.params.CG_forgetting_rate + + self.output_window = None + if getattr(self.params, 'window_output', False): + if getattr(self.params, 'use_clipped_window', False): + self.output_window = dcf.hann2d_clipped( + self.output_sz.astype('long'), + self.output_sz.astype('long') * + self.params.effective_search_area / + self.params.search_area_scale, + centered=False) + else: + self.output_window = dcf.hann2d( + self.output_sz.astype('long'), centered=False) + + # Initialize some learning things + self.init_learning() + + # Convert 
image + im = image.astype('float32') + self.im = im # For debugging only + + # Setup scale bounds + self.image_sz = np.array([im.shape[0], im.shape[1]], 'float32') + self.min_scale_factor = np.max(10 / self.base_target_sz) + self.max_scale_factor = np.min(self.image_sz / self.base_target_sz) + + # Extract and transform sample + x = self.generate_init_samples(im) + + # Initialize iounet + if self.use_iou_net: + self.init_iou_net() + + # Initialize projection matrix + self.init_projection_matrix(x) + + # Transform to get the training sample + train_x = self.preprocess_sample(x) + + # Generate label function + init_y = self.init_label_function(train_x) + + # Init memory + self.init_memory(train_x) + + # Init optimizer and do initial optimization + self.init_optimization(train_x, init_y) + + self.pos_iounet = self.pos.copy() + + self.time += time.time() - tic + + def track(self, image): + + self.frame_num += 1 + + # Convert image + # im = numpy_to_paddle(image) + im = image.astype('float32') + self.im = im # For debugging only + + # ------- LOCALIZATION ------- # + + # Get sample + sample_pos = self.pos.round() + sample_scales = self.target_scale * self.params.scale_factors + + test_x = self.extract_processed_sample(im, self.pos, sample_scales, + self.img_sample_sz) + + # Compute scores + scores_raw = self.apply_filter(test_x) + translation_vec, scale_ind, s, flag = self.localize_target(scores_raw) + + # Update position and scale + if flag != 'not_found': + if self.use_iou_net: + update_scale_flag = getattr(self.params, + 'update_scale_when_uncertain', + True) or flag != 'uncertain' + if getattr(self.params, 'use_classifier', True): + self.update_state(sample_pos + translation_vec) + self.refine_target_box(sample_pos, sample_scales[scale_ind], + scale_ind, update_scale_flag) + elif getattr(self.params, 'use_classifier', True): + self.update_state(sample_pos + translation_vec, + sample_scales[scale_ind]) + + # ------- UPDATE ------- # + + # Check flags and set learning rate if hard negative + update_flag = flag not in ['not_found', 'uncertain'] + hard_negative = (flag == 'hard_negative') + learning_rate = self.params.hard_negative_learning_rate if hard_negative else None + + if update_flag: + # Get train sample + train_x = TensorList([x[scale_ind:scale_ind + 1] for x in test_x]) + + # Create label for sample + train_y = self.get_label_function(sample_pos, + sample_scales[scale_ind]) + + # Update memory + self.update_memory(train_x, train_y, learning_rate) + + # Train filter + if hard_negative: + self.filter_optimizer.run(self.params.hard_negative_CG_iter) + elif (self.frame_num - 1) % self.params.train_skipping == 0: + self.filter_optimizer.run(self.params.CG_iter) + self.filter = self.filter_optimizer.x + + # Set the pos of the tracker to iounet pos + if self.use_iou_net and flag != 'not_found': + self.pos = self.pos_iounet.copy() + + # Return new state + yx = self.pos - (self.target_sz - 1) / 2 + new_state = np.array( + [yx[1], yx[0], self.target_sz[1], self.target_sz[0]], 'float32') + + return new_state.tolist() + + def update_memory(self, + sample_x: TensorList, + sample_y: TensorList, + learning_rate=None): + replace_ind = self.update_sample_weights( + self.sample_weights, self.previous_replace_ind, + self.num_stored_samples, self.num_init_samples, self.fparams, + learning_rate) + self.previous_replace_ind = replace_ind + for train_samp, x, ind in zip(self.training_samples, sample_x, + replace_ind): + train_samp[ind] = x[0] + for y_memory, y, ind in zip(self.y, sample_y, replace_ind): + 
y_memory[ind] = y[0] + if self.hinge_mask is not None: + for m, y, ind in zip(self.hinge_mask, sample_y, replace_ind): + m[ind] = layers.cast(y >= self.params.hinge_threshold, + 'float32')[0] + self.num_stored_samples += 1 + + def update_sample_weights(self, + sample_weights, + previous_replace_ind, + num_stored_samples, + num_init_samples, + fparams, + learning_rate=None): + # Update weights and get index to replace in memory + replace_ind = [] + for sw, prev_ind, num_samp, num_init, fpar in zip( + sample_weights, previous_replace_ind, num_stored_samples, + num_init_samples, fparams): + lr = learning_rate + if lr is None: + lr = fpar.learning_rate + + init_samp_weight = getattr(fpar, 'init_samples_minimum_weight', + None) + if init_samp_weight == 0: + init_samp_weight = None + s_ind = 0 if init_samp_weight is None else num_init + + if num_samp == 0 or lr == 1: + sw[:] = 0 + sw[0] = 1 + r_ind = 0 + else: + # Get index to replace + r_ind = np.argmin(sw[s_ind:], 0) + r_ind = int(r_ind + s_ind) + + # Update weights + if prev_ind is None: + sw /= 1 - lr + sw[r_ind] = lr + else: + sw[r_ind] = sw[prev_ind] / (1 - lr) + + sw /= sw.sum() + if init_samp_weight is not None and sw[:num_init].sum( + ) < init_samp_weight: + sw /= init_samp_weight + sw[num_init:].sum() + sw[:num_init] = init_samp_weight / num_init + + replace_ind.append(r_ind) + + return replace_ind + + def localize_target(self, scores_raw): + # Weighted sum (if multiple features) with interpolation in fourier domain + weight = self.fparams.attribute('translation_weight', 1.0) + scores_raw = weight * scores_raw + sf_weighted = fourier.cfft2(scores_raw) / (scores_raw.size(2) * + scores_raw.size(3)) + for i, (sz, ksz) in enumerate(zip(self.feature_sz, self.kernel_size)): + sf_weighted[i] = fourier.shift_fs(sf_weighted[i], math.pi * ( + 1 - np.array([ksz[0] % 2, ksz[1] % 2]) / sz)) + + scores_fs = fourier.sum_fs(sf_weighted) + scores = fourier.sample_fs(scores_fs, self.output_sz) + + if self.output_window is not None and not getattr( + self.params, 'perform_hn_without_windowing', False): + scores *= self.output_window + + if getattr(self.params, 'advanced_localization', False): + return self.localize_advanced(scores) + + # Get maximum + max_score, max_disp = dcf.max2d(scores) + scale_ind = np.argmax(max_score, axis=0)[0] + max_disp = max_disp.astype('float32') + + # Convert to displacements in the base scale + output_sz = self.output_sz.copy() + disp = mod((max_disp + output_sz / 2), output_sz) - output_sz / 2 + + # Compute translation vector and scale change factor + translation_vec = np.reshape( + disp[scale_ind].astype('float32'), [-1]) * ( + self.img_support_sz / self.output_sz) * self.target_scale + translation_vec *= self.params.scale_factors[scale_ind] + + # Shift the score output for visualization purposes + if self.params.debug >= 2: + sz = scores.shape[-2:] + scores = np.concatenate( + [scores[..., sz[0] // 2:, :], scores[..., :sz[0] // 2, :]], -2) + scores = np.concatenate( + [scores[..., sz[1] // 2:], scores[..., :sz[1] // 2]], -1) + + return translation_vec, scale_ind, scores, None + + def update_state(self, new_pos, new_scale=None): + # Update scale + if new_scale is not None: + self.target_scale = np.clip(new_scale, self.min_scale_factor, + self.max_scale_factor) + self.target_sz = self.base_target_sz * self.target_scale + + # Update pos + inside_ratio = 0.2 + inside_offset = (inside_ratio - 0.5) * self.target_sz + self.pos = np.maximum( + np.minimum(new_pos, + self.image_sz.astype('float32') - inside_offset), + 
inside_offset) + + def get_label_function(self, sample_pos, sample_scale): + # Generate label function + train_y = TensorList() + target_center_norm = (self.pos - sample_pos) / (self.img_support_sz * + sample_scale) + for sig, sz, ksz in zip(self.sigma, self.feature_sz, self.kernel_size): + center = sz * target_center_norm + 0.5 * np.array( + [(ksz[0] + 1) % 2, (ksz[1] + 1) % 2], 'float32') + train_y.append(dcf.label_function_spatial(sz, sig, center)) + return train_y + + def extract_sample(self, + im: np.ndarray, + pos: np.ndarray, + scales, + sz: np.ndarray, + debug_save_name): + return self.params.features.extract(im, pos, scales, sz, + debug_save_name) + + def extract_processed_sample(self, + im: np.ndarray, + pos: np.ndarray, + scales, + sz: np.ndarray, + debug_save_name=None) -> (TensorList, + TensorList): + x = self.extract_sample(im, pos, scales, sz, debug_save_name) + return self.preprocess_sample(self.project_sample(x)) + + def apply_filter(self, sample_x: TensorList): + with fluid.dygraph.guard(): + sample_x = sample_x.apply(n2p) + filter = self.filter.apply(n2p) + return operation.conv2d(sample_x, filter, mode='same').numpy() + + def init_projection_matrix(self, x): + # Set if using projection matrix + self.params.use_projection_matrix = getattr( + self.params, 'use_projection_matrix', True) + + if self.params.use_projection_matrix: + self.compressed_dim = self.fparams.attribute('compressed_dim', None) + + proj_init_method = getattr(self.params, 'proj_init_method', 'pca') + if proj_init_method == 'pca': + raise NotImplementedError + elif proj_init_method == 'randn': + with fluid.dygraph.guard(): + self.projection_matrix = TensorList([ + None if cdim is None else layers.gaussian_random( + (cdim, ex.shape[1], 1, 1), 0.0, + 1 / math.sqrt(ex.shape[1])).numpy() + for ex, cdim in zip(x, self.compressed_dim) + ]) + elif proj_init_method == 'np_randn': + rng = np.random.RandomState(0) + self.projection_matrix = TensorList([ + None if cdim is None else rng.normal( + size=(cdim, ex.shape[1], 1, 1), + loc=0.0, + scale=1 / math.sqrt(ex.shape[1])).astype('float32') + for ex, cdim in zip(x, self.compressed_dim) + ]) + elif proj_init_method == 'ones': + self.projection_matrix = TensorList([ + None if cdim is None else + np.ones((cdim, ex.shape[1], 1, 1), + 'float32') / math.sqrt(ex.shape[1]) + for ex, cdim in zip(x, self.compressed_dim) + ]) + else: + self.compressed_dim = x.size(1) + self.projection_matrix = TensorList([None] * len(x)) + + def preprocess_sample(self, x: TensorList) -> (TensorList, TensorList): + if getattr(self.params, '_feature_window', False): + x = x * self.feature_window + return x + + def init_label_function(self, train_x): + # Allocate label function + self.y = TensorList([ + np.zeros( + [self.params.sample_memory_size, 1, x.shape[2], x.shape[3]], + 'float32') for x in train_x + ]) + + # Output sigma factor + output_sigma_factor = self.fparams.attribute('output_sigma_factor') + self.sigma = output_sigma_factor * np.ones((2, ), 'float32') * ( + self.feature_sz / self.img_support_sz * + self.base_target_sz).apply(np.prod).apply(np.sqrt) + + # Center pos in normalized coords + target_center_norm = (self.pos - np.round(self.pos)) / ( + self.target_scale * self.img_support_sz) + + # Generate label functions + for y, sig, sz, ksz, x in zip(self.y, self.sigma, self.feature_sz, + self.kernel_size, train_x): + center_pos = sz * target_center_norm + 0.5 * np.array( + [(ksz[0] + 1) % 2, (ksz[1] + 1) % 2], 'float32') + for i, T in enumerate(self.transforms[:x.shape[0]]): + 
sample_center = center_pos + np.array( + T.shift, 'float32') / self.img_support_sz * sz + y[i] = dcf.label_function_spatial(sz, sig, sample_center) + + # Return only the ones to use for initial training + return TensorList([y[:x.shape[0]] for y, x in zip(self.y, train_x)]) + + def init_memory(self, train_x): + # Initialize first-frame training samples + self.num_init_samples = train_x.size(0) + self.init_sample_weights = TensorList( + [np.ones(x.shape[0], 'float32') / x.shape[0] for x in train_x]) + self.init_training_samples = train_x + + # Sample counters and weights + self.num_stored_samples = self.num_init_samples.copy() + self.previous_replace_ind = [None] * len(self.num_stored_samples) + self.sample_weights = TensorList([ + np.zeros(self.params.sample_memory_size, 'float32') for x in train_x + ]) + for sw, init_sw, num in zip(self.sample_weights, + self.init_sample_weights, + self.num_init_samples): + sw[:num] = init_sw + + # Initialize memory + self.training_samples = TensorList( + [[np.zeros([cdim, x.shape[2], x.shape[3]], 'float32')] * + self.params.sample_memory_size + for x, cdim in zip(train_x, self.compressed_dim)]) + + def init_learning(self): + # Get window function + self.feature_window = TensorList( + [dcf.hann2d(sz) for sz in self.feature_sz]) + + # Filter regularization + self.filter_reg = self.fparams.attribute('filter_reg') + + # Activation function after the projection matrix (phi_1 in the paper) + projection_activation = getattr(self.params, 'projection_activation', + 'none') + if isinstance(projection_activation, tuple): + projection_activation, act_param = projection_activation + + if projection_activation == 'none': + self.projection_activation = lambda x: x + elif projection_activation == 'relu': + self.projection_activation = layers.relu + elif projection_activation == 'elu': + self.projection_activation = layers.elu + elif projection_activation == 'mlu': + self.projection_activation = lambda x: layers.elu(leaky_relu(x, 1 / act_param), act_param) + else: + raise ValueError('Unknown activation') + + # Activation function after the output scores (phi_2 in the paper) + response_activation = getattr(self.params, 'response_activation', + 'none') + if isinstance(response_activation, tuple): + response_activation, act_param = response_activation + + if response_activation == 'none': + self.response_activation = lambda x: x + elif response_activation == 'relu': + self.response_activation = layers.relu + elif response_activation == 'elu': + self.response_activation = layers.elu + elif response_activation == 'mlu': + self.response_activation = lambda x: layers.elu(leaky_relu(x, 1 / act_param), act_param) + else: + raise ValueError('Unknown activation') + + def generate_init_samples(self, im: np.ndarray) -> TensorList: + """Generate augmented initial samples.""" + + # Compute augmentation size + aug_expansion_factor = getattr(self.params, + 'augmentation_expansion_factor', None) + aug_expansion_sz = self.img_sample_sz.copy() + aug_output_sz = None + if aug_expansion_factor is not None and aug_expansion_factor != 1: + aug_expansion_sz = (self.img_sample_sz * + aug_expansion_factor).astype('long') + aug_expansion_sz += ( + aug_expansion_sz - self.img_sample_sz.astype('long')) % 2 + aug_expansion_sz = aug_expansion_sz.astype('float32') + aug_output_sz = self.img_sample_sz.astype('long').tolist() + + # Random shift operator + get_rand_shift = lambda: None + random_shift_factor = getattr(self.params, 'random_shift_factor', 0) + if random_shift_factor > 0: + get_rand_shift = 
lambda: ((np.random.uniform(size=[2]) - 0.5) * self.img_sample_sz * random_shift_factor).astype('long').tolist() + + # Create transofmations + self.transforms = [augmentation.Identity(aug_output_sz)] + if 'shift' in self.params.augmentation: + self.transforms.extend([ + augmentation.Translation(shift, aug_output_sz) + for shift in self.params.augmentation['shift'] + ]) + if 'relativeshift' in self.params.augmentation: + get_absolute = lambda shift: (np.array(shift, 'float32') * self.img_sample_sz / 2).astype('long').tolist() + self.transforms.extend([ + augmentation.Translation(get_absolute(shift), aug_output_sz) + for shift in self.params.augmentation['relativeshift'] + ]) + if 'fliplr' in self.params.augmentation and self.params.augmentation[ + 'fliplr']: + self.transforms.append( + augmentation.FlipHorizontal(aug_output_sz, get_rand_shift())) + if 'blur' in self.params.augmentation: + self.transforms.extend([ + augmentation.Blur(sigma, aug_output_sz, get_rand_shift()) + for sigma in self.params.augmentation['blur'] + ]) + if 'scale' in self.params.augmentation: + self.transforms.extend([ + augmentation.Scale(scale_factor, aug_output_sz, + get_rand_shift()) + for scale_factor in self.params.augmentation['scale'] + ]) + if 'rotate' in self.params.augmentation: + self.transforms.extend([ + augmentation.Rotate(angle, aug_output_sz, get_rand_shift()) + for angle in self.params.augmentation['rotate'] + ]) + + # Generate initial samples + init_samples = self.params.features.extract_transformed( + im, self.pos, self.target_scale, aug_expansion_sz, self.transforms) + + # Remove augmented samples for those that shall not have + for i, use_aug in enumerate(self.fparams.attribute('use_augmentation')): + if not use_aug: + init_samples[i] = init_samples[i][0:1] + + # Add dropout samples + if 'dropout' in self.params.augmentation: + num, prob = self.params.augmentation['dropout'] + self.transforms.extend(self.transforms[:1] * num) + with fluid.dygraph.guard(): + for i, use_aug in enumerate( + self.fparams.attribute('use_augmentation')): + if use_aug: + init_samples[i] = np.concatenate([ + init_samples[i], dropout2d( + layers.expand( + n2p(init_samples[i][0:1]), (num, 1, 1, 1)), + prob, + is_train=True).numpy() + ]) + + return init_samples + + def init_optimization(self, train_x, init_y): + # Initialize filter + filter_init_method = getattr(self.params, 'filter_init_method', 'zeros') + self.filter = TensorList([ + np.zeros([1, cdim, sz[0], sz[1]], 'float32') + for x, cdim, sz in zip(train_x, self.compressed_dim, + self.kernel_size) + ]) + if filter_init_method == 'zeros': + pass + elif filter_init_method == 'ones': + for idx, f in enumerate(self.filter): + self.filter[idx] = np.ones(f.shape, + 'float32') / np.prod(f.shape) + elif filter_init_method == 'np_randn': + rng = np.random.RandomState(0) + for idx, f in enumerate(self.filter): + self.filter[idx] = rng.normal( + size=f.shape, loc=0, + scale=1 / np.prod(f.shape)).astype('float32') + elif filter_init_method == 'randn': + for idx, f in enumerate(self.filter): + with fluid.dygraph.guard(): + self.filter[idx] = layers.gaussian_random( + f.shape, std=1 / np.prod(f.shape)).numpy() + else: + raise ValueError('Unknown "filter_init_method"') + + # Get parameters + self.params.update_projection_matrix = getattr( + self.params, 'update_projection_matrix', + True) and self.params.use_projection_matrix + optimizer = getattr(self.params, 'optimizer', 'GaussNewtonCG') + + # Setup factorized joint optimization + if self.params.update_projection_matrix: + 
self.joint_problem = FactorizedConvProblem( + self.init_training_samples, init_y, self.filter_reg, + self.fparams.attribute('projection_reg'), self.params, + self.init_sample_weights, self.projection_activation, + self.response_activation) + + # Variable containing both filter and projection matrix + joint_var = self.filter.concat(self.projection_matrix) + + # Initialize optimizer + analyze_convergence = getattr(self.params, 'analyze_convergence', + False) + if optimizer == 'GaussNewtonCG': + self.joint_optimizer = GaussNewtonCG( + self.joint_problem, + joint_var, + plotting=(self.params.debug >= 3), + analyze=True, + fig_num=(12, 13, 14)) + elif optimizer == 'GradientDescentL2': + self.joint_optimizer = GradientDescentL2( + self.joint_problem, + joint_var, + self.params.optimizer_step_length, + self.params.optimizer_momentum, + plotting=(self.params.debug >= 3), + debug=analyze_convergence, + fig_num=(12, 13)) + + # Do joint optimization + if isinstance(self.params.init_CG_iter, (list, tuple)): + self.joint_optimizer.run(self.params.init_CG_iter) + else: + self.joint_optimizer.run(self.params.init_CG_iter // + self.params.init_GN_iter, + self.params.init_GN_iter) + + # Get back filter and optimizer + len_x = len(self.joint_optimizer.x) + self.filter = self.joint_optimizer.x[:len_x // 2] # w2 in paper + self.projection_matrix = self.joint_optimizer.x[len_x // + 2:] # w1 in paper + + if analyze_convergence: + opt_name = 'CG' if getattr(self.params, 'CG_optimizer', + True) else 'GD' + for val_name, values in zip(['loss', 'gradient'], [ + self.joint_optimizer.losses, + self.joint_optimizer.gradient_mags + ]): + val_str = ' '.join( + ['{:.8e}'.format(v.item()) for v in values]) + file_name = '{}_{}.txt'.format(opt_name, val_name) + with open(file_name, 'a') as f: + f.write(val_str + '\n') + raise RuntimeError('Exiting') + + # Re-project samples with the new projection matrix + compressed_samples = self.project_sample(self.init_training_samples, + self.projection_matrix) + for train_samp, init_samp in zip(self.training_samples, + compressed_samples): + for idx in range(init_samp.shape[0]): + train_samp[idx] = init_samp[idx] + + self.hinge_mask = None + + # Initialize optimizer + self.conv_problem = ConvProblem(self.training_samples, self.y, + self.filter_reg, self.sample_weights, + self.response_activation) + + if optimizer == 'GaussNewtonCG': + self.filter_optimizer = ConjugateGradient( + self.conv_problem, + self.filter, + fletcher_reeves=self.params.fletcher_reeves, + direction_forget_factor=self.params.direction_forget_factor, + debug=(self.params.debug >= 3), + fig_num=(12, 13)) + elif optimizer == 'GradientDescentL2': + self.filter_optimizer = GradientDescentL2( + self.conv_problem, + self.filter, + self.params.optimizer_step_length, + self.params.optimizer_momentum, + debug=(self.params.debug >= 3), + fig_num=12) + + # Transfer losses from previous optimization + if self.params.update_projection_matrix: + self.filter_optimizer.residuals = self.joint_optimizer.residuals + self.filter_optimizer.losses = self.joint_optimizer.losses + + if not self.params.update_projection_matrix: + self.filter_optimizer.run(self.params.init_CG_iter) + + # Post optimization + self.filter_optimizer.run(self.params.post_init_CG_iter) + self.filter = self.filter_optimizer.x + + # Free memory + del self.init_training_samples + if self.params.use_projection_matrix: + del self.joint_problem, self.joint_optimizer + + def project_sample(self, x: TensorList, proj_matrix=None): + # Apply projection matrix + if 
proj_matrix is None: + proj_matrix = self.projection_matrix + with fluid.dygraph.guard(): + return operation.conv2d(x.apply(n2p), proj_matrix.apply(n2p)).apply( + self.projection_activation).numpy() + + def get_iounet_box(self, pos, sz, sample_pos, sample_scale): + """All inputs in original image coordinates""" + box_center = (pos - sample_pos) / sample_scale + (self.iou_img_sample_sz + - 1) / 2 + box_sz = sz / sample_scale + target_ul = box_center - (box_sz - 1) / 2 + return np.concatenate([np.flip(target_ul, 0), np.flip(box_sz, 0)]) + + def get_iou_features(self): + return self.params.features.get_unique_attribute('iounet_features') + + def get_iou_backbone_features(self): + return self.params.features.get_unique_attribute( + 'iounet_backbone_features') + + def init_iou_net(self): + # Setup IoU net + self.iou_predictor = self.params.features.get_unique_attribute( + 'iou_predictor') + + # Get target boxes for the different augmentations + self.iou_target_box = self.get_iounet_box(self.pos, self.target_sz, + self.pos.round(), + self.target_scale) + target_boxes = TensorList() + if self.params.iounet_augmentation: + for T in self.transforms: + if not isinstance( + T, (augmentation.Identity, augmentation.Translation, + augmentation.FlipHorizontal, + augmentation.FlipVertical, augmentation.Blur)): + break + target_boxes.append(self.iou_target_box + np.array( + [T.shift[1], T.shift[0], 0, 0])) + else: + target_boxes.append(self.iou_target_box.copy()) + target_boxes = np.concatenate(target_boxes.view(1, 4), 0) + + # Get iou features + iou_backbone_features = self.get_iou_backbone_features() + + # Remove other augmentations such as rotation + iou_backbone_features = TensorList( + [x[:target_boxes.shape[0], ...] for x in iou_backbone_features]) + + # Extract target feat + with fluid.dygraph.guard(): + iou_backbone_features = iou_backbone_features.apply(n2p) + target_boxes = n2p(target_boxes) + target_feat = self.iou_predictor.get_filter(iou_backbone_features, + target_boxes) + self.target_feat = TensorList( + [layers.reduce_mean(x, 0).numpy() for x in target_feat]) + + if getattr(self.params, 'iounet_not_use_reference', False): + self.target_feat = TensorList([ + np.full_like(tf, tf.norm() / tf.numel()) + for tf in self.target_feat + ]) + + def optimize_boxes(self, iou_features, init_boxes): + with fluid.dygraph.guard(): + # Optimize iounet boxes + init_boxes = np.reshape(init_boxes, (1, -1, 4)) + step_length = self.params.box_refinement_step_length + + target_feat = self.target_feat.apply(n2p) + iou_features = iou_features.apply(n2p) + output_boxes = n2p(init_boxes) + + for f in iou_features: + f.stop_gradient = False + for i_ in range(self.params.box_refinement_iter): + # forward pass + bb_init = output_boxes + bb_init.stop_gradient = False + + outputs = self.iou_predictor.predict_iou(target_feat, + iou_features, bb_init) + + if isinstance(outputs, (list, tuple)): + outputs = outputs[0] + + outputs.backward() + + # Update proposal + bb_init_np = bb_init.numpy() + bb_init_gd = bb_init.gradient() + output_boxes = bb_init_np + step_length * bb_init_gd * np.tile( + bb_init_np[:, :, 2:], (1, 1, 2)) + output_boxes = n2p(output_boxes) + step_length *= self.params.box_refinement_step_decay + + return layers.reshape(output_boxes, ( + -1, 4)).numpy(), layers.reshape(outputs, (-1, )).numpy() + + def refine_target_box(self, + sample_pos, + sample_scale, + scale_ind, + update_scale=True): + # Initial box for refinement + init_box = self.get_iounet_box(self.pos, self.target_sz, sample_pos, + sample_scale) + 
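+        # Note: init_box is expressed in the IoUNet sample crop's coordinate
+        # frame as [x, y, w, h] (get_iounet_box flips the internal (row, col)
+        # order and divides by sample_scale), so the random jitter and the
+        # IoU-driven refinement below operate in crop space; the selected box
+        # is mapped back to image coordinates further down using sample_scale
+        # and sample_pos.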
+ # Extract features from the relevant scale + iou_features = self.get_iou_features() + iou_features = TensorList( + [x[scale_ind:scale_ind + 1, ...] for x in iou_features]) + + init_boxes = np.reshape(init_box, (1, 4)).copy() + + rand_fn = lambda a, b: np.random.rand(a, b).astype('float32') + + if self.params.num_init_random_boxes > 0: + # Get random initial boxes + square_box_sz = np.sqrt(init_box[2:].prod()) + rand_factor = square_box_sz * np.concatenate([ + self.params.box_jitter_pos * np.ones(2), + self.params.box_jitter_sz * np.ones(2) + ]) + minimal_edge_size = init_box[2:].min() / 3 + rand_bb = (rand_fn(self.params.num_init_random_boxes, 4) - 0.5 + ) * rand_factor + new_sz = np.clip(init_box[2:] + rand_bb[:, 2:], minimal_edge_size, + 1e10) + new_center = (init_box[:2] + init_box[2:] / 2) + rand_bb[:, :2] + init_boxes = np.concatenate([new_center - new_sz / 2, new_sz], 1) + init_boxes = np.concatenate( + [np.reshape(init_box, (1, 4)), init_boxes]) + + # Refine boxes by maximizing iou + output_boxes, output_iou = self.optimize_boxes(iou_features, init_boxes) + + # Remove weird boxes with extreme aspect ratios + output_boxes[:, 2:] = np.clip(output_boxes[:, 2:], 1, 1e10) + aspect_ratio = output_boxes[:, 2] / output_boxes[:, 3] + keep_ind = (aspect_ratio < self.params.maximal_aspect_ratio) * \ + (aspect_ratio > 1 / self.params.maximal_aspect_ratio) + output_boxes = output_boxes[keep_ind, :] + output_iou = output_iou[keep_ind] + + # If no box found + if output_boxes.shape[0] == 0: + return + + # Take average of top k boxes + k = getattr(self.params, 'iounet_k', 5) + topk = min(k, output_boxes.shape[0]) + inds = np.argsort(-output_iou)[:topk] + predicted_box = np.mean(output_boxes[inds, :], axis=0) + predicted_iou = np.mean( + np.reshape(output_iou, (-1, 1))[inds, :], axis=0) + + # Update position + new_pos = predicted_box[:2] + predicted_box[2:] / 2 - ( + self.iou_img_sample_sz - 1) / 2 + new_pos = np.flip(new_pos, 0) * sample_scale + sample_pos + new_target_sz = np.flip(predicted_box[2:], 0) * sample_scale + new_scale = np.sqrt( + np.prod(new_target_sz) / np.prod(self.base_target_sz)) + + self.pos_iounet = new_pos.copy() + + if getattr(self.params, 'use_iounet_pos_for_learning', True): + self.pos = new_pos.copy() + + self.target_sz = new_target_sz + + if update_scale: + self.target_scale = new_scale + + def localize_advanced(self, scores): + """Does the advanced localization with hard negative detection and target not found.""" + + sz = scores.shape[-2:] + + if self.output_window is not None and getattr( + self.params, 'perform_hn_without_windowing', False): + scores_orig = scores.copy() + + scores_orig = np.concatenate([ + scores_orig[..., (sz[0] + 1) // 2:, :], + scores_orig[..., :(sz[0] + 1) // 2, :] + ], -2) + scores_orig = np.concatenate([ + scores_orig[..., :, (sz[1] + 1) // 2:], + scores_orig[..., :, :(sz[1] + 1) // 2] + ], -1) + + scores *= self.output_window + + # Shift scores back + scores = np.concatenate([ + scores[..., (sz[0] + 1) // 2:, :], scores[..., :(sz[0] + 1) // 2, :] + ], -2) + scores = np.concatenate([ + scores[..., :, (sz[1] + 1) // 2:], scores[..., :, :(sz[1] + 1) // 2] + ], -1) + + # Find maximum + max_score1, max_disp1 = dcf.max2d(scores) + scale_ind = np.argmax(max_score1, axis=0)[0] + max_score1 = max_score1[scale_ind] + max_disp1 = np.reshape(max_disp1[scale_ind].astype('float32'), (-1)) + + target_disp1 = max_disp1 - self.output_sz // 2 + translation_vec1 = target_disp1 * (self.img_support_sz / + self.output_sz) * self.target_scale + + if max_score1 < 
self.params.target_not_found_threshold: + return translation_vec1, scale_ind, scores, 'not_found' + + if self.output_window is not None and getattr( + self.params, 'perform_hn_without_windowing', False): + scores = scores_orig + + # Mask out target neighborhood + target_neigh_sz = self.params.target_neighborhood_scale * self.target_sz / self.target_scale + tneigh_top = int(max(round(max_disp1[0] - target_neigh_sz[0] / 2), 0)) + tneigh_bottom = int( + min(round(max_disp1[0] + target_neigh_sz[0] / 2 + 1), sz[0])) + tneigh_left = int(max(round(max_disp1[1] - target_neigh_sz[1] / 2), 0)) + tneigh_right = int( + min(round(max_disp1[1] + target_neigh_sz[1] / 2 + 1), sz[1])) + scores_masked = scores[scale_ind:scale_ind + 1, ...].copy() + scores_masked[..., tneigh_top:tneigh_bottom, tneigh_left: + tneigh_right] = 0 + + # Find new maximum + max_score2, max_disp2 = dcf.max2d(scores_masked) + max_disp2 = np.reshape(max_disp2.astype('float32'), (-1)) + target_disp2 = max_disp2 - self.output_sz // 2 + translation_vec2 = target_disp2 * (self.img_support_sz / + self.output_sz) * self.target_scale + + # Handle the different cases + if max_score2 > self.params.distractor_threshold * max_score1: + disp_norm1 = np.sqrt(np.sum(target_disp1**2)) + disp_norm2 = np.sqrt(np.sum(target_disp2**2)) + disp_threshold = self.params.dispalcement_scale * math.sqrt( + sz[0] * sz[1]) / 2 + + if disp_norm2 > disp_threshold and disp_norm1 < disp_threshold: + return translation_vec1, scale_ind, scores, 'hard_negative' + if disp_norm2 < disp_threshold and disp_norm1 > disp_threshold: + return translation_vec2, scale_ind, scores, 'hard_negative' + if disp_norm2 > disp_threshold and disp_norm1 > disp_threshold: + return translation_vec1, scale_ind, scores, 'uncertain' + + # If also the distractor is close, return with highest score + return translation_vec1, scale_ind, scores, 'uncertain' + + if max_score2 > self.params.hard_negative_threshold * max_score1 and max_score2 > self.params.target_not_found_threshold: + return translation_vec1, scale_ind, scores, 'hard_negative' + + return translation_vec1, scale_ind, scores, None diff --git a/PaddleCV/tracking/pytracking/tracker/atom/optim.py b/PaddleCV/tracking/pytracking/tracker/atom/optim.py new file mode 100644 index 0000000000000000000000000000000000000000..8c25c43d6123187baf9ffbffef5b75f2e016a4dc --- /dev/null +++ b/PaddleCV/tracking/pytracking/tracker/atom/optim.py @@ -0,0 +1,243 @@ +import numpy as np +from paddle.fluid import layers +from paddle import fluid + +from pytracking.libs import optimization, TensorList, operation +from pytracking.libs.paddle_utils import PTensor, broadcast_op, n2p, static_identity +import math + + +def stack_input(e): + if isinstance(e, list): + e_exist = [] + for x in e: + if x is not None: + e_exist.append(x) + e = np.stack(e_exist) + else: + assert isinstance(e, np.ndarray) + if len(e.shape) == 1: + e = np.expand_dims(e, 1) + return e + + +class FactorizedConvProblem(optimization.L2Problem): + def __init__(self, + training_samples: TensorList, + y: TensorList, + filter_reg: TensorList, + projection_reg, + params, + sample_weights: TensorList, + projection_activation, + response_activation): + self.training_samples = training_samples + self.y = y + self.filter_reg = filter_reg + self.sample_weights = sample_weights + self.params = params + self.projection_reg = projection_reg + self.projection_activation = projection_activation + self.response_activation = response_activation + + self.diag_M = self.filter_reg.concat(projection_reg) + + 
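+        # diag_M stacks the filter and projection regularization weights; it is
+        # used as the diagonal preconditioner applied in M1() below.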
self.inputs_dict = {} + # stack tensors + self.training_samples_stack = None + self.y_stack = None + self.sample_weights_stack = None + + def get_inputs(self, scope=''): + if scope not in self.inputs_dict: + training_samples_p = TensorList([ + fluid.layers.data( + '{}training_samples_{}'.format(scope, idx), + shape=[None] + list(v[0].shape), + stop_gradient=False, + append_batch_size=False) + for idx, v in enumerate(self.training_samples) + ]) + y_p = TensorList([ + fluid.layers.data( + '{}y_{}'.format(scope, idx), + shape=[None] + list(v[0].shape), + stop_gradient=False, + append_batch_size=False) for idx, v in enumerate(self.y) + ]) + sample_weights_p = TensorList([ + fluid.layers.data( + '{}sample_weights_{}'.format(scope, idx), + shape=[None, 1], + stop_gradient=False, + append_batch_size=False) + for idx, v in enumerate(self.sample_weights) + ]) + self.inputs_dict[scope] = (training_samples_p, y_p, + sample_weights_p) + + return self.inputs_dict[scope] + + def get_feed_dict(self, scope=''): + if self.training_samples_stack is None or self.y_stack is None or self.sample_weights_stack is None: + self.training_samples_stack = self.training_samples.apply( + stack_input) + self.y_stack = self.y.apply(stack_input) + self.sample_weights_stack = self.sample_weights.apply(stack_input) + feed_dict = {} + for idx, v in enumerate(self.training_samples_stack): + feed_dict['{}training_samples_{}'.format(scope, idx)] = v + for idx, v in enumerate(self.y_stack): + feed_dict['{}y_{}'.format(scope, idx)] = v + for idx, v in enumerate(self.sample_weights_stack): + feed_dict['{}sample_weights_{}'.format(scope, idx)] = v + return feed_dict + + def __call__(self, x: TensorList, scope=''): + """ + Compute residuals + :param x: [filters, projection_matrices] + :return: [data_terms, filter_regularizations, proj_mat_regularizations] + """ + training_samples, y, samples_weights = self.get_inputs(scope) + + filter = x[:len(x) // 2] # w2 in paper + P = x[len(x) // 2:] # w1 in paper + + # Do first convolution + compressed_samples = operation.conv1x1( + training_samples, P).apply(self.projection_activation) + + # Do second convolution + residuals = operation.conv2d( + compressed_samples, filter, + mode='same').apply(self.response_activation) + + # Compute data residuals + residuals = residuals - y + + residuals = residuals * samples_weights.sqrt() + + # Add regularization for projection matrix + # TODO: remove static_identity + # for now, this is needed. 
Otherwise the gradient is None + residuals.extend( + filter.apply(static_identity) * self.filter_reg.apply(math.sqrt)) + + # Add regularization for projection matrix + residuals.extend( + P.apply(static_identity) * self.projection_reg.apply(math.sqrt)) + + return residuals + + def ip_input(self, a: TensorList, b: TensorList): + # return a.reshape(-1) @ b.reshape(-1) + num = len(a) // 2 # Number of filters + a_filter = a[:num] + b_filter = b[:num] + a_P = a[num:] + b_P = b[num:] + + # Filter inner product + ip_out = a_filter.reshape(-1) @b_filter.reshape(-1) + # ip_out = operation.conv2d(a_filter, b_filter).view(-1) + + # Add projection matrix part + ip_out += a_P.reshape(-1) @b_P.reshape(-1) + # ip_out += operation.conv2d(a_P.view(1, -1, 1, 1), b_P.view(1, -1, 1, 1)).view(-1) + + # Have independent inner products for each filter + return ip_out.concat(ip_out.clone()) + + def M1(self, x: TensorList): + return x / self.diag_M + + +class ConvProblem(optimization.L2Problem): + def __init__(self, + training_samples: TensorList, + y: TensorList, + filter_reg: TensorList, + sample_weights: TensorList, + response_activation): + self.training_samples = training_samples + self.y = y + self.filter_reg = filter_reg + self.sample_weights = sample_weights + self.response_activation = response_activation + + self.inputs_dict = {} + # stack tensors + self.training_samples_stack = None + self.y_stack = None + self.sample_weights_stack = None + + def get_feed_dict(self, scope=''): + if self.training_samples_stack is None or self.y_stack is None or self.sample_weights_stack is None: + self.training_samples_stack = self.training_samples.apply( + stack_input) + self.y_stack = self.y.apply(stack_input) + self.sample_weights_stack = self.sample_weights.apply(stack_input) + feed_dict = {} + for idx, v in enumerate(self.training_samples_stack): + feed_dict['{}training_samples_{}'.format(scope, idx)] = v + for idx, v in enumerate(self.y_stack): + feed_dict['{}y_{}'.format(scope, idx)] = v + for idx, v in enumerate(self.sample_weights_stack): + feed_dict['{}sample_weights_{}'.format(scope, idx)] = v + return feed_dict + + def get_inputs(self, scope=''): + if scope not in self.inputs_dict: + training_samples_p = TensorList([ + fluid.layers.data( + '{}training_samples_{}'.format(scope, idx), + shape=[None] + list(v[0].shape), + stop_gradient=False, + append_batch_size=False) + for idx, v in enumerate(self.training_samples) + ]) + y_p = TensorList([ + fluid.layers.data( + '{}y_{}'.format(scope, idx), + shape=[None] + list(v[0].shape), + stop_gradient=False, + append_batch_size=False) for idx, v in enumerate(self.y) + ]) + sample_weights_p = TensorList([ + fluid.layers.data( + '{}sample_weights_{}'.format(scope, idx), + shape=[None] + list(v[0].shape), + stop_gradient=False, + append_batch_size=False) + for idx, v in enumerate(self.sample_weights) + ]) + self.inputs_dict[scope] = (training_samples_p, y_p, + sample_weights_p) + + return self.inputs_dict[scope] + + def __call__(self, x: TensorList, scope=''): + """ + Compute residuals + :param x: [filters] + :return: [data_terms, filter_regularizations] + """ + training_samples, y, samples_weights = self.get_inputs(scope) + # Do convolution and compute residuals + residuals = operation.conv2d( + training_samples, x, mode='same').apply(self.response_activation) + residuals = residuals - y + + residuals = residuals * samples_weights.sqrt() + + # Add regularization for projection matrix + residuals.extend( + x.apply(static_identity) * self.filter_reg.apply(math.sqrt)) + + 
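+        # residuals now holds the sample-weighted data terms followed by the
+        # filter regularization terms.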
return residuals + + def ip_input(self, a: TensorList, b: TensorList): + return a.reshape(-1) @b.reshape(-1) + # return (a * b).sum() + # return operation.conv2d(a, b).view(-1) diff --git a/PaddleCV/tracking/pytracking/tracker/base/__init__.py b/PaddleCV/tracking/pytracking/tracker/base/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/PaddleCV/tracking/pytracking/tracker/base/basetracker.py b/PaddleCV/tracking/pytracking/tracker/base/basetracker.py new file mode 100644 index 0000000000000000000000000000000000000000..77e473d2d77239f8471a43ca015fb717dfa0fdfe --- /dev/null +++ b/PaddleCV/tracking/pytracking/tracker/base/basetracker.py @@ -0,0 +1,286 @@ +import matplotlib + +matplotlib.use('TkAgg') +import matplotlib.pyplot as plt +import matplotlib.patches as patches +import cv2 as cv +import time +import os + + +class BaseTracker: + """Base class for all trackers.""" + + def __init__(self, params): + self.params = params + + def initialize(self, image, state, class_info=None): + """Overload this function in your tracker. This should initialize the model.""" + raise NotImplementedError + + def track(self, image): + """Overload this function in your tracker. This should track in the frame and update the model.""" + raise NotImplementedError + + def track_sequence(self, sequence): + """Run tracker on a sequence.""" + + # Initialize + image = self._read_image(sequence.frames[0]) + + times = [] + start_time = time.time() + self.initialize(image, sequence.init_state) + init_time = getattr(self, 'time', time.time() - start_time) + times.append(init_time) + + if self.params.visualization: + self.init_visualization() + self.visualize(image, sequence.init_state) + + # Track + tracked_bb = [sequence.init_state] + for frame in sequence.frames[1:]: + image = self._read_image(frame) + + start_time = time.time() + state = self.track(image) + times.append(time.time() - start_time) + + tracked_bb.append(state) + + if self.params.visualization: + self.visualize(image, state) + + return tracked_bb, times + + def track_videofile(self, videofilepath, optional_box=None): + """Run track with a video file input.""" + + assert os.path.isfile(videofilepath), "Invalid param {}".format( + videofilepath) + ", videofilepath must be a valid videofile" + + if hasattr(self, 'initialize_features'): + self.initialize_features() + + cap = cv.VideoCapture(videofilepath) + display_name = 'Display: ' + self.params.tracker_name + cv.namedWindow(display_name, cv.WINDOW_NORMAL | cv.WINDOW_KEEPRATIO) + cv.resizeWindow(display_name, 960, 720) + success, frame = cap.read() + cv.imshow(display_name, frame) + if success is not True: + print("Read frame from {} failed.".format(videofilepath)) + exit(-1) + if optional_box is not None: + assert isinstance(optional_box, list, tuple) + assert len(optional_box) == 4, "valid box's foramt is [x,y,w,h]" + self.initialize(frame, optional_box) + else: + while True: + # cv.waitKey() + frame_disp = frame.copy() + + cv.putText(frame_disp, 'Select target ROI and press ENTER', + (20, 30), cv.FONT_HERSHEY_COMPLEX_SMALL, 1.5, + (0, 0, 0), 1) + + x, y, w, h = cv.selectROI( + display_name, frame_disp, fromCenter=False) + init_state = [x, y, w, h] + self.initialize(frame, init_state) + break + + while True: + ret, frame = cap.read() + + if frame is None: + return + + frame_disp = frame.copy() + + # Draw box + state = self.track(frame) + state = [int(s) for s in state] + cv.rectangle(frame_disp, (state[0], state[1]), + 
(state[2] + state[0], state[3] + state[1]), + (0, 255, 0), 5) + + font_color = (0, 0, 0) + cv.putText(frame_disp, 'Tracking!', (20, 30), + cv.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1) + cv.putText(frame_disp, 'Press r to reset', (20, 55), + cv.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1) + cv.putText(frame_disp, 'Press q to quit', (20, 80), + cv.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1) + + # Display the resulting frame + cv.imshow(display_name, frame_disp) + key = cv.waitKey(1) + if key == ord('q'): + break + elif key == ord('r'): + ret, frame = cap.read() + frame_disp = frame.copy() + + cv.putText(frame_disp, 'Select target ROI and press ENTER', + (20, 30), cv.FONT_HERSHEY_COMPLEX_SMALL, 1.5, + (0, 0, 0), 1) + + cv.imshow(display_name, frame_disp) + x, y, w, h = cv.selectROI( + display_name, frame_disp, fromCenter=False) + init_state = [x, y, w, h] + self.initialize(frame, init_state) + + # When everything done, release the capture + cap.release() + cv.destroyAllWindows() + + def track_webcam(self): + """Run tracker with webcam.""" + + class UIControl: + def __init__(self): + self.mode = 'init' # init, select, track + self.target_tl = (-1, -1) + self.target_br = (-1, -1) + self.mode_switch = False + + def mouse_callback(self, event, x, y, flags, param): + if event == cv.EVENT_LBUTTONDOWN and self.mode == 'init': + self.target_tl = (x, y) + self.target_br = (x, y) + self.mode = 'select' + self.mode_switch = True + elif event == cv.EVENT_MOUSEMOVE and self.mode == 'select': + self.target_br = (x, y) + elif event == cv.EVENT_LBUTTONDOWN and self.mode == 'select': + self.target_br = (x, y) + self.mode = 'track' + self.mode_switch = True + + def get_tl(self): + return self.target_tl if self.target_tl[0] < self.target_br[ + 0] else self.target_br + + def get_br(self): + return self.target_br if self.target_tl[0] < self.target_br[ + 0] else self.target_tl + + def get_bb(self): + tl = self.get_tl() + br = self.get_br() + + bb = [tl[0], tl[1], br[0] - tl[0], br[1] - tl[1]] + return bb + + ui_control = UIControl() + cap = cv.VideoCapture(0) + display_name = 'Display: ' + self.params.tracker_name + cv.namedWindow(display_name, cv.WINDOW_NORMAL | cv.WINDOW_KEEPRATIO) + cv.resizeWindow(display_name, 960, 720) + cv.setMouseCallback(display_name, ui_control.mouse_callback) + + if hasattr(self, 'initialize_features'): + self.initialize_features() + + while True: + # Capture frame-by-frame + ret, frame = cap.read() + frame_disp = frame.copy() + + if ui_control.mode == 'track' and ui_control.mode_switch: + ui_control.mode_switch = False + init_state = ui_control.get_bb() + self.initialize(frame, init_state) + + # Draw box + if ui_control.mode == 'select': + cv.rectangle(frame_disp, + ui_control.get_tl(), + ui_control.get_br(), (255, 0, 0), 2) + elif ui_control.mode == 'track': + state = self.track(frame) + state = [int(s) for s in state] + cv.rectangle(frame_disp, (state[0], state[1]), + (state[2] + state[0], state[3] + state[1]), + (0, 255, 0), 5) + + # Put text + font_color = (0, 0, 0) + if ui_control.mode == 'init' or ui_control.mode == 'select': + cv.putText(frame_disp, 'Select target', (20, 30), + cv.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1) + cv.putText(frame_disp, 'Press q to quit', (20, 55), + cv.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1) + elif ui_control.mode == 'track': + cv.putText(frame_disp, 'Tracking!', (20, 30), + cv.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1) + cv.putText(frame_disp, 'Press r to reset', (20, 55), + cv.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1) + 
cv.putText(frame_disp, 'Press q to quit', (20, 80), + cv.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1) + # Display the resulting frame + cv.imshow(display_name, frame_disp) + key = cv.waitKey(1) + if key == ord('q'): + break + elif key == ord('r'): + ui_control.mode = 'init' + + # When everything done, release the capture + cap.release() + cv.destroyAllWindows() + + def reset_tracker(self): + pass + + def press(self, event): + if event.key == 'p': + self.pause_mode = not self.pause_mode + print("Switching pause mode!") + elif event.key == 'r': + self.reset_tracker() + print("Resetting target pos to gt!") + + def init_visualization(self): + # plt.ion() + self.pause_mode = False + self.fig, self.ax = plt.subplots(1) + self.fig.canvas.mpl_connect('key_press_event', self.press) + plt.tight_layout() + + def visualize(self, image, state): + self.ax.cla() + self.ax.imshow(image) + rect = patches.Rectangle( + (state[0], state[1]), + state[2], + state[3], + linewidth=1, + edgecolor='r', + facecolor='none') + self.ax.add_patch(rect) + + if hasattr(self, 'gt_state') and False: + gt_state = self.gt_state + rect = patches.Rectangle( + (gt_state[0], gt_state[1]), + gt_state[2], + gt_state[3], + linewidth=1, + edgecolor='g', + facecolor='none') + self.ax.add_patch(rect) + self.ax.set_axis_off() + self.ax.axis('equal') + plt.draw() + plt.pause(0.001) + + if self.pause_mode: + plt.waitforbuttonpress() + + def _read_image(self, image_file: str): + return cv.cvtColor(cv.imread(image_file), cv.COLOR_BGR2RGB) diff --git a/PaddleCV/tracking/pytracking/tracker/siamfc/__init__.py b/PaddleCV/tracking/pytracking/tracker/siamfc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..91cf4214c152ea9863259998d79ab9b313e64338 --- /dev/null +++ b/PaddleCV/tracking/pytracking/tracker/siamfc/__init__.py @@ -0,0 +1,5 @@ +from .siamfc import SiamFC + + +def get_tracker_class(): + return SiamFC diff --git a/PaddleCV/tracking/pytracking/tracker/siamfc/eval_siamfc_otb.py b/PaddleCV/tracking/pytracking/tracker/siamfc/eval_siamfc_otb.py new file mode 100644 index 0000000000000000000000000000000000000000..d71b1f10009be2bfda952e2c1832965b9c76cd2a --- /dev/null +++ b/PaddleCV/tracking/pytracking/tracker/siamfc/eval_siamfc_otb.py @@ -0,0 +1,170 @@ +import os +import numpy as np + +from PIL import Image + +import os.path as osp +import sys +CURRENT_DIR = osp.dirname(__file__) +sys.path.append(osp.join(CURRENT_DIR, '..', '..', '..')) + +from pytracking.pysot_toolkit.utils import success_overlap, success_error +import json +from pytracking.tracker.siamfc.siamfc import SiamFC + +from tqdm import tqdm + +from pytracking.parameter.siamfc.default import parameters + + +class ValidOTB(SiamFC): + def __init__(self, dataset_root, dataset_name, params): + super(ValidOTB, self).__init__(params) + """ + dataset_root: the root directory of dataset + dataset_name: the name of OTB dataste, [CVPR2013, OTB50, OTB100] + """ + self.params = self.params + self.root_path = dataset_root + if not os.path.exists(self.root_path): + raise Exception("'{}' does not exists".format(self.root_path)) + + dataset_list = ['CVPR13', 'OTB2013', 'OTB100', 'OTB50'] + if dataset_name not in dataset_list: + raise Exception("ValidOTB's dataset_name can only be one of {}". 
+ format(dataset_list)) + if dataset_name == 'OTB2013': + dataset_name = 'CVPR13' + self.dataset_name = dataset_name + self.otb2013_json = os.path.join(self.root_path, dataset_name + '.json') + + self.meta_data = json.load(open(self.otb2013_json, 'rb')) + self.video_name = list(self.meta_data.keys()) + + def inference(self, epoch): + + gtbb = [] + prebb = [] + """ add save dir """ + save_dir = "./eval_otb13/epoch_" + str(epoch) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + + # load videos + pbar = tqdm( + self.meta_data.keys(), + desc='loading ' + self.dataset_name, + ncols=100) + for idx, vid in enumerate(pbar): + pbar.set_postfix_str(vid) + + gt_boxs = self.meta_data[vid]['gt_rect'] + start_frame, end_frame = 0, len(gt_boxs) + img_list = self.meta_data[vid]['img_names'] + assert len(img_list) == len(gt_boxs) + + gt_box_list = [] + pre_box_list = [] + for i in range(start_frame, end_frame): + img = Image.open(os.path.join(self.root_path, img_list[i])) + if len(img.size) < 3 or img.size[-1] == 1: + img = img.convert('RGB') + + gt_box = gt_boxs[i - start_frame] + + if i == start_frame: + self.initialize(image=img, state=gt_box) + pre_box_list.append(gt_box) + gt_box_list.append(gt_box) + continue + else: + pre_box = self.track(img) + + pre_box_list.append(list(pre_box)) + gt_box_list.append(gt_box) + + gtbb += gt_box_list + prebb += pre_box_list + """ add save_dir""" + vid_save_dir = os.path.join(save_dir, vid + '.txt') + with open(vid_save_dir, 'w') as f: + outputs = [] + for res in pre_box_list: + outputs.append('{},{},{},{}'.format(res[0], res[1], res[2], + res[3])) + f.write('\n'.join(outputs)) + + auc = success_overlap(np.array(gtbb), np.array(prebb), len(gtbb)) + + thresholds = np.arange(0, 51, 1) + gt_center = self.convert_bb_to_center(np.array(gtbb)) + tracker_center = self.convert_bb_to_center(np.array(prebb)) + precision = success_error( + np.array(gt_center), + np.array(tracker_center), thresholds, len(gtbb)) + print("####AUC:{}, Precision:{}".format( + np.mean(auc), np.mean(precision))) + + return np.mean(auc), np.mean(precision) + + def convert_bb_to_center(self, bboxes): + return np.array([(bboxes[:, 0] + (bboxes[:, 2] - 1) / 2), + (bboxes[:, 1] + (bboxes[:, 3] - 1) / 2)]).T + + +import argparse +parser = argparse.ArgumentParser() +parser.add_argument( + '--checkpoint', + type=str, + default="./checkpoint/", + help="the path of saved siamfc params file") +parser.add_argument( + '--dataset_dir', + type=str, + default="/paddle/Datasets/OTB100", + help="the path of OTB dataset") +parser.add_argument( + '--dataset_name', + type=str, + default="CVPR13", + help="can only be one of [CVPR13, OTB2013, OTB50, OTB100]") + +parser.add_argument( + '--start_epoch', + type=int, + default=1, + help="evaluate from start_epoch epoch, greater than 1") +parser.add_argument( + '--end_epoch', + type=int, + default=50, + help="evaluate ends at end_epoch epoch, smaller than 50 ") + +args = parser.parse_args() + +if __name__ == '__main__': + + params = parameters() + params.net_path = args.checkpoint + start_epoch = args.start_epoch + end_epoch = args.end_epoch + + assert start_epoch >= 1 and end_epoch <= 50 and start_epoch < end_epoch + + best_auc, best_epoch = 0, start_epoch + + for i in range(start_epoch, end_epoch, 1): + params.net_path = os.path.join(args.checkpoint, "SiamNet_ep%004d" % i) + valid = ValidOTB( + dataset_root=args.dataset_dir, + dataset_name=args.dataset_name, + params=params) + + auc, precision = valid.inference(epoch=i) + + if auc > best_auc: + best_auc = auc + 
best_epoch = i + print("####Best AUC is {}, corresponding epoch is {}".format( + best_auc, best_epoch)) diff --git a/PaddleCV/tracking/pytracking/tracker/siamfc/eval_siamfc_vot.py b/PaddleCV/tracking/pytracking/tracker/siamfc/eval_siamfc_vot.py new file mode 100644 index 0000000000000000000000000000000000000000..e12448cf17121c80c6c72a7e86f0b2dbf50e7e1b --- /dev/null +++ b/PaddleCV/tracking/pytracking/tracker/siamfc/eval_siamfc_vot.py @@ -0,0 +1,258 @@ +import os +import numpy as np + +from PIL import Image + +import os.path as osp +import sys +CURRENT_DIR = osp.dirname(__file__) +sys.path.append(osp.join(CURRENT_DIR, '..', '..', '..')) + +import json +from pytracking.tracker.siamfc.siamfc import SiamFC + +from tqdm import tqdm + +from pytracking.parameter.siamfc.default import parameters + + +class ValidVOT(SiamFC): + def __init__(self, dataset_root, dataset_name, params): + super(ValidVOT, self).__init__(params) + """ + dataset_root: the root directory of dataset + dataset_name: the name of VOT dataste, [VOt2015, VOT2018, ...] + """ + self.params = self.params + self.root_path = dataset_root + if not os.path.exists(self.root_path): + raise Exception("'{}' does not exists".format(self.root_path)) + + dataset_list = ['VOT2015', 'VOT2018'] + if dataset_name not in dataset_list: + raise Exception("ValidVOT's dataset_name can only be one of {}". + format(dataset_list)) + + self.dataset_name = dataset_name + self.vot2013_json = os.path.join(self.root_path, dataset_name + '.json') + # self.otb2013_json = "/paddle/Datasets/OTB100/CVPR13.json" + + self.meta_data = json.load(open(self.vot2013_json, 'rb')) + self.video_name = list(self.meta_data.keys()) + + def inference_reinit(self, epoch, start_frame=0): + + # video-wised + vid_num = len(self.video_name) + vid_ious = np.zeros(vid_num) + vid_length = np.zeros(vid_num) + fail_num = np.zeros(vid_num) + + burn_in_period = 5 + pbar = tqdm( + self.meta_data.keys(), + desc='loading ' + self.dataset_name, + ncols=100) + + for idx, vid in enumerate(pbar): + pbar.set_postfix_str(vid) + + gt_boxs = self.meta_data[vid]['gt_rect'] + img_list = self.meta_data[vid]['img_names'] + imgs_num = len(img_list) + + gt_box_list = [] + pre_box_list = [] + + valid_frames_num = imgs_num - start_frame + step = start_frame + reinit = True + re_init_frame = step + while step < imgs_num: + img = Image.open(os.path.join(self.root_path, img_list[step])) + + gt_box = list(map(float, self.region_to_bbox(gt_boxs[step]))) + + if reinit: + # the tracker was initialized + # five frames after the failure + self.initialize(img, gt_box) + reinit = False + # print("reinit, vid: {}, step: {}, failnum: {}".format(vid, step, fail_num[idx])) + continue + + pre_box = self.track(img) + if step - re_init_frame < 10: + # burn in period + step += 1 + valid_frames_num -= 1 + continue + + pre_box_list.append(list(pre_box)) + gt_box_list.append(gt_box) + + iou = self._compute_iou(pre_box, gt_box) + vid_ious[idx] += iou + + if iou == 0.: + reinit = True + + fail_num[idx] += 1 + # the tracker was initialized + # five frames after the failure + step += burn_in_period + re_init_frame = step + valid_frames_num -= burn_in_period + step += 1 + + vid_length[idx] = valid_frames_num + #print("idx: {}, vid: {}, failure: {}, miou: {}\n".format(idx, vid, fail_num[idx], + # vid_ious[idx]/valid_frames_num)) + + acc = np.sum(vid_ious) / np.sum(vid_length) + print("##########Evaluation##########") + print("##acc = {}".format(acc)) + print("##failure = {}".format(np.sum(fail_num))) + + return acc, np.sum(fail_num) + + 
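+    # Note: _compute_iou below treats (x, y, w, h) boxes as inclusive pixel
+    # extents, hence the "+ 1" terms in the area and intersection computations.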
def _compute_iou(self, box1, box2): + """ + computing IoU + print("acc shape", acc.shape, "vid_length shape: ", vid_length.shape) + print("acc shape", acc.shape, "vid_length shape: ", vid_length.shape) + :param rec1: (x0, y0, w, h), which reflects + (top, left, bottom, right) + :param rec2: (x0, y0, w, h) + :return: scala value of IoU + """ + rec1 = box1 + rec2 = box2 + # computing area of each rectangles + S_rec1 = (rec1[2] + 1) * (rec1[3] + 1) + S_rec2 = (rec2[2] + 1) * (rec2[3] + 1) + + # computing the sum_area + sum_area = S_rec1 + S_rec2 + + # find the each edge of intersect rectangle + left_line = max(rec1[1], rec2[1]) + right_line = min(rec1[3] + rec1[1], rec2[3] + rec2[1]) + top_line = max(rec1[0], rec2[0]) + bottom_line = min(rec1[2] + rec2[0], rec2[2] + rec2[0]) + + # judge if there is an intersect + if left_line >= right_line or top_line >= bottom_line: + iou = 0. + else: + intersect = (right_line - left_line + 1) * ( + bottom_line - top_line + 1) + iou = (intersect / (sum_area - intersect)) * 1.0 + assert iou >= 0 + assert iou <= 1.01 + return iou + + def region_to_bbox(self, region, center=False): + + n = len(region) + region = np.array(region) + assert n == 4 or n == 8, ( + 'GT region format is invalid, should have 4 or 8 entries.') + + # we assume the grountruth bounding boxes are saved with 0-indexing + def _rect(region, center): + + if center: + x = region[0] + y = region[1] + w = region[2] + h = region[3] + cx = x + w / 2 + cy = y + h / 2 + return cx, cy, w, h + else: + region[0] -= 1 + region[1] -= 1 + return region + + def _poly(region, center): + cx = np.mean(region[::2]) + cy = np.mean(region[1::2]) + x1 = np.min(region[::2]) + x2 = np.max(region[::2]) + y1 = np.min(region[1::2]) + y2 = np.max(region[1::2]) + A1 = np.linalg.norm(region[0:2] - region[2:4]) * np.linalg.norm( + region[2:4] - region[4:6]) + A2 = (x2 - x1) * (y2 - y1) + s = np.sqrt(A1 / A2) + w = s * (x2 - x1) + 1 + h = s * (y2 - y1) + 1 + + if center: + return cx, cy, w, h + else: + return cx - w / 2, cy - h / 2, w, h + + if n == 4: + return _rect(region, center) + else: + return _poly(region, center) + + +import argparse +parser = argparse.ArgumentParser() +parser.add_argument( + '--checkpoint', + type=str, + default="./checkpoint/", + help="the path of saved siamfc params file") +parser.add_argument( + '--dataset_dir', + type=str, + default="/paddle/Datasets/VOT2015", + help="the path of VOT dataset") +parser.add_argument( + '--dataset_name', + type=str, + default="VOT2015", + help="can only be one of [VOT2015, VOT2018]") + +parser.add_argument( + '--start_epoch', + type=int, + default=1, + help="evaluate from start_epoch epoch, greater than 1") +parser.add_argument( + '--end_epoch', + type=int, + default=50, + help="evaluate ends at end_epoch epoch, smaller than 50 ") + +args = parser.parse_args() + +if __name__ == '__main__': + + params = parameters() + params.net_path = args.checkpoint + start_epoch = args.start_epoch + end_epoch = args.end_epoch + + assert start_epoch >= 1 and end_epoch <= 50 and start_epoch < end_epoch + + best_acc, best_failure, best_epoch = 0, 100, start_epoch + + for i in range(start_epoch, end_epoch, 2): + params.net_path = os.path.join(args.checkpoint, "SiamNet_ep%004d" % i) + valid = ValidVOT( + dataset_root=args.dataset_dir, + dataset_name=args.dataset_name, + params=params) + + acc, failure = valid.inference_reinit(epoch=i) + print("####Epoch: {}, ACC: {}, Failure: {}".format(i, acc, failure)) + if acc > best_acc and failure <= 84: + best_acc = acc + best_epoch = i + 
print("####Best ACC: {}, Failure: {}, corresponding epoch: {}". + format(best_acc, failure, best_epoch)) diff --git a/PaddleCV/tracking/pytracking/tracker/siamfc/siamfc.py b/PaddleCV/tracking/pytracking/tracker/siamfc/siamfc.py new file mode 100644 index 0000000000000000000000000000000000000000..704fafe389e1607f843af20fae2590819b4c21d5 --- /dev/null +++ b/PaddleCV/tracking/pytracking/tracker/siamfc/siamfc.py @@ -0,0 +1,208 @@ +import time +import numpy as np + +import paddle.fluid as fluid +from paddle.fluid import dygraph + +from pytracking.tracker.base.basetracker import BaseTracker + +from ltr.models.siamese.siam import siamfc_alexnet + +import cv2 +# for debug +from pytracking.parameter.siamfc.default import parameters + + +class SiamFC(BaseTracker): + def __init__(self, params=parameters()): + + self.params = params + self.model_initializer() + + def initialize_features(self): + if not getattr(self, 'features_initialized', False): + self.params.features.initialize() + self.features_initialized = True + + def model_initializer(self): + import os + net_path = self.params.net_path + if net_path is None: + net_path = self.params.features.features[0].net_path + if not os.path.exists(net_path): + raise Exception("not found {}".format(net_path)) + with dygraph.guard(): + self.model = siamfc_alexnet(backbone_is_test=True) + #state_dict, _ = fluid.load_dygraph(net_path) + weight_params, opt_params = fluid.load_dygraph(net_path) + state_dict = self.model.state_dict() + for k1, k2 in zip(state_dict.keys(), weight_params.keys()): + if list(state_dict[k1].shape) == list(weight_params[k2].shape): + state_dict[k1].set_value(weight_params[k2]) + else: + raise Exception("ERROR, shape not match") + self.model.load_dict(state_dict) + self.model.eval() + + def _cosine_window(self, size): + """ + get the cosine window + """ + cos_window = np.hanning(int(size[0]))[:, np.newaxis].dot( + np.hanning(int(size[1]))[np.newaxis, :]) + cos_window = cos_window.astype(np.float32) + cos_window /= np.sum(cos_window) + return cos_window + + def initialize(self, image, state, *args, **kwargs): + # state (x, y, w, h) + # Initialize some stuff + self.frame_num = 1 + self.time = 0 + + # Get position and size + box = state + image = np.asarray(image) + # convert box to 0-indexed and center based [y, x, h, w] + box = np.array( + [ + box[1] - 1 + (box[3] - 1) / 2, box[0] - 1 + (box[2] - 1) / 2, + box[3], box[2] + ], + dtype=np.float32) + self.center, self.target_sz = box[:2], box[2:] + + # create hanning window + self.upscale_sz = self.params.response_up * self.params.response_sz + self.hann_window = np.outer( + np.hanning(self.upscale_sz), np.hanning(self.upscale_sz)) + self.hann_window /= self.hann_window.sum() + + # search scale factors + self.scale_factors = self.params.scale_step**np.linspace( + -(self.params.scale_num // 2), self.params.scale_num // 2, + self.params.scale_num) + + # exemplar and search sizes + context = self.params.context * np.sum(self.target_sz) + self.z_sz = np.sqrt(np.prod(self.target_sz + context)) + self.x_sz = self.z_sz * \ + self.params.instance_sz / self.params.exemplar_sz + + # exemplar image + self.avg_color = np.mean(image, axis=(0, 1)) + exemplar_image = self._crop_and_resize( + image, + self.center, + self.z_sz, + out_size=self.params.exemplar_sz, + pad_color=self.avg_color) + self.exemplar_img_1s = exemplar_image[np.newaxis, :, :, :] + self.exemplar_img = np.transpose(self.exemplar_img_1s, + [0, 3, 1, 2]).astype(np.float32) + self.exemplar_img = np.repeat( + self.exemplar_img, 
self.params.scale_num, axis=0) + + def _crop_and_resize(self, image, center, size, out_size, pad_color): + # convert box to corners (0-indexed) + size = round(size) + corners = np.concatenate((np.round(center - (size - 1) / 2), + np.round(center - (size - 1) / 2) + size)) + corners = np.round(corners).astype(int) + + # pad image if necessary + pads = np.concatenate((-corners[:2], corners[2:] - image.shape[:2])) + npad = max(0, int(pads.max())) + if npad > 0: + image = cv2.copyMakeBorder( + image, + npad, + npad, + npad, + npad, + cv2.BORDER_CONSTANT, + value=pad_color) + + # crop image patch + corners = (corners + npad).astype(int) + patch = image[corners[0]:corners[2], corners[1]:corners[3]] + + # resize to out_size + patch = cv2.resize(patch, (out_size, out_size)) + + return patch + + def track(self, image): + #print("## track, input image shape:", image.shape) + self.frame_num += 1 + + image = np.asarray(image) + # search images + instance_images = [ + self._crop_and_resize( + image, + self.center, + self.x_sz * f, + out_size=self.params.instance_sz, + pad_color=self.avg_color) for f in self.scale_factors + ] + instance_images = np.stack(instance_images, axis=0) + instance_images = np.transpose(instance_images, + [0, 3, 1, 2]).astype(np.float32) + + # calculate response + # exemplar features + with fluid.dygraph.guard(): + instance_images = fluid.dygraph.to_variable(instance_images) + self.exemplar_img = fluid.dygraph.to_variable(self.exemplar_img) + responses = self.model(self.exemplar_img, instance_images) + + responses = responses.numpy() + + responses = np.squeeze(responses, axis=1) + # upsample responses and penalize scale changes + responses = np.stack( + [ + cv2.resize( + t, (self.upscale_sz, self.upscale_sz), + interpolation=cv2.INTER_CUBIC) for t in responses + ], + axis=0) + responses[:self.params.scale_num // 2] *= self.params.scale_penalty + responses[self.params.scale_num // 2 + 1:] *= self.params.scale_penalty + + # peak scale + scale_list = np.amax(responses, axis=(1, 2)) + scale_id = np.argmax(scale_list) + #scale_id = np.argmax(np.amax(responses, axis=(1, 2))) + # peak location + response = responses[scale_id] + response -= response.min() + response /= response.sum() + 1e-16 + response = (1 - self.params.window_influence) * response + \ + self.params.window_influence * self.hann_window + loc = np.unravel_index(response.argmax(), response.shape) + + # locate target center + disp_in_response = np.array(loc) - (self.upscale_sz - 1.) 
/ 2 + disp_in_instance = disp_in_response * \ + self.params.total_stride / self.params.response_up + disp_in_image = disp_in_instance * self.x_sz * \ + self.scale_factors[scale_id] / self.params.instance_sz + self.center += disp_in_image + + # update target size + scale = (1 - self.params.scale_lr) * 1.0 + \ + self.params.scale_lr * self.scale_factors[scale_id] + self.target_sz *= scale + self.z_sz *= scale + self.x_sz *= scale + + # return 1-indexed and left-top based bounding box + box = np.array([ + self.center[1] + 1 - (self.target_sz[1] - 1) / 2, + self.center[0] + 1 - (self.target_sz[0] - 1) / 2, self.target_sz[1], + self.target_sz[0] + ]) + + return box diff --git a/PaddleCV/tracking/pytracking/utils/__init__.py b/PaddleCV/tracking/pytracking/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c99c377521f541c01de31c6e02813d362ff4d7c4 --- /dev/null +++ b/PaddleCV/tracking/pytracking/utils/__init__.py @@ -0,0 +1,2 @@ +# from .evaluation import * +from .params import * diff --git a/PaddleCV/tracking/pytracking/utils/params.py b/PaddleCV/tracking/pytracking/utils/params.py new file mode 100644 index 0000000000000000000000000000000000000000..bcaf3d6831366dfb7d91584cf6ed488846261efd --- /dev/null +++ b/PaddleCV/tracking/pytracking/utils/params.py @@ -0,0 +1,31 @@ +from pytracking.libs import TensorList +import random + + +class TrackerParams: + """Class for tracker parameters.""" + + def free_memory(self): + for a in dir(self): + if not a.startswith('__') and hasattr( + getattr(self, a), 'free_memory'): + getattr(self, a).free_memory() + + +class FeatureParams: + """Class for feature specific parameters""" + + def __init__(self, *args, **kwargs): + if len(args) > 0: + raise ValueError + + for name, val in kwargs.items(): + if isinstance(val, list): + setattr(self, name, TensorList(val)) + else: + setattr(self, name, val) + + +def Choice(*args): + """Can be used to sample random parameter values.""" + return random.choice(args) diff --git a/PaddleCV/tracking/pytracking/utils/plotting.py b/PaddleCV/tracking/pytracking/utils/plotting.py new file mode 100644 index 0000000000000000000000000000000000000000..ef95655b47a35bf186939006b356986847ba3971 --- /dev/null +++ b/PaddleCV/tracking/pytracking/utils/plotting.py @@ -0,0 +1,50 @@ +import matplotlib +matplotlib.use('TkAgg') +import matplotlib.pyplot as plt +import numpy as np +from pytracking.libs.paddle_utils import p2n, PTensor + + +def save_tensor(a: PTensor, save_name): + a_np = p2n(a) + np.save(save_name, a_np) + + +def show_tensor(a: PTensor, fig_num=None, title=None): + """Display a 2D tensor. + args: + fig_num: Figure number. + title: Title of figure. + """ + a_np = a.squeeze().cpu().clone().detach().numpy() + if a_np.ndim == 3: + a_np = np.transpose(a_np, (1, 2, 0)) + plt.figure(fig_num) + plt.tight_layout() + plt.cla() + plt.imshow(a_np) + plt.axis('off') + plt.axis('equal') + if title is not None: + plt.title(title) + plt.draw() + plt.pause(0.001) + + +def plot_graph(a: PTensor, fig_num=None, title=None): + """Plot graph. Data is a 1D tensor. + args: + fig_num: Figure number. + title: Title of figure. 
+ """ + a_np = a.squeeze().cpu().clone().detach().numpy() + if a_np.ndim > 1: + raise ValueError + plt.figure(fig_num) + # plt.tight_layout() + plt.cla() + plt.plot(a_np) + if title is not None: + plt.title(title) + plt.draw() + plt.pause(0.001) diff --git a/PaddleCV/tracking/pytracking/visualize_results_on_benchmark.ipynb b/PaddleCV/tracking/pytracking/visualize_results_on_benchmark.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..3207694a14580dbee99284492724d7d32ce82edc --- /dev/null +++ b/PaddleCV/tracking/pytracking/visualize_results_on_benchmark.ipynb @@ -0,0 +1,176 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import cv2 as cv\n", + "import numpy as np\n", + "from matplotlib.pyplot import Rectangle\n", + "from videofig import videofig\n", + "\n", + "sys.path.append('..')\n", + "from pytracking.pysot_toolkit.datasets import DatasetFactory\n", + "from pytracking.pysot_toolkit.environment import env_settings\n", + "\n", + "# set the dataset name here\n", + "dataset_name = 'CVPR13'\n", + "\n", + "if dataset_name in ['CVPR13', 'OTB50', 'OTB100']:\n", + " # for OTB datasets, we save results into the same directory\n", + " save_dataset_name = 'OTB100'\n", + "else:\n", + " save_dataset_name = dataset_name\n", + "\n", + "dataset_root = os.path.join(env_settings().dataset_path, save_dataset_name)\n", + "\n", + "# load dataset\n", + "dataset = DatasetFactory.create_dataset(name=dataset_name, dataset_root=dataset_root, load_img=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset.videos.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Select results to show" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tracker_test_params = 'siamfc.default'\n", + "exp_id = 'siamfc.siamfc_alexnet_vid.epoch49'\n", + "videoname = 'Bolt'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Show" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib notebook\n", + "\n", + "if 'OTB100' == save_dataset_name:\n", + " filename = '{}.txt'.format(videoname)\n", + "elif 'VOT' in save_dataset_name:\n", + " filename = 'baseline/{vname}/{vname}_001.txt'.format(vname=videoname)\n", + "else:\n", + " raise NotImplemented\n", + " \n", + "video = dataset[videoname]\n", + "\n", + "# load tracking results\n", + "boxs = []\n", + "with open(os.path.join(env_settings().results_path, save_dataset_name, tracker_test_params, exp_id, filename), 'r') as file_handle:\n", + " for line in file_handle:\n", + " boxs.append([float(v) for v in line.strip().split(',')])\n", + "\n", + "def redraw_fn(f, ax):\n", + " img_path, _ = video[f]\n", + " img = cv.cvtColor(cv.imread(img_path), cv.COLOR_BGR2RGB)\n", + " \n", + " box = boxs[f]\n", + " if len(box) == 4:\n", + " x, y, w, h = box\n", + " else:\n", + " x, y, w, h = 0, 0, 0, 0\n", + " \n", + " if not redraw_fn.initialized:\n", + " redraw_fn.img_handle = ax.imshow(img)\n", + " box_artist = Rectangle((x, y), w, h,\n", + " fill=False, # remove background\n", + " lw=2,\n", + " edgecolor=\"red\")\n", + " ax.add_patch(box_artist)\n", + " redraw_fn.box_handle = box_artist\n", + " redraw_fn.text_handle 
= ax.text(0., 1 - 0.05,\n", + " 'Frame: {}'.format(f + 1),\n", + " transform=ax.transAxes,\n", + " color='yellow', size=12)\n", + " redraw_fn.initialized = True\n", + " else:\n", + " redraw_fn.img_handle.set_array(img)\n", + " redraw_fn.box_handle.set_xy((x, y))\n", + " redraw_fn.box_handle.set_width(w)\n", + " redraw_fn.box_handle.set_height(h)\n", + " redraw_fn.text_handle.set_text('Frame: {}'.format(f + 1))\n", + "\n", + "redraw_fn.initialized = False\n", + "\n", + "videofig(len(video), redraw_fn)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/PaddleCV/tracking/requirements.txt b/PaddleCV/tracking/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d4c2c330feb049481311ec4f2098107f0769e4a --- /dev/null +++ b/PaddleCV/tracking/requirements.txt @@ -0,0 +1,11 @@ +git+https://github.com/tensorpack/dataflow.git +cython +pycocotools +lmdb +pandas +jpeg4py +opencv-python +tensorboardX +videofig +jupyter +tqdm