diff --git a/PaddleCV/tracking/.gitmodules b/PaddleCV/tracking/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..236705c5a2a142eff9cd15e25d6f6e2531783799
--- /dev/null
+++ b/PaddleCV/tracking/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "pytracking/pysot-toolkit"]
+ path = pytracking/pysot-toolkit
+ url = https://github.com/StrangerZhang/pysot-toolkit.git
diff --git a/PaddleCV/tracking/LICENSE b/PaddleCV/tracking/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..f288702d2fa16d3cdf0035b15a9fcbc552cd88e7
--- /dev/null
+++ b/PaddleCV/tracking/LICENSE
@@ -0,0 +1,674 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+
+ The GNU General Public License does not permit incorporating your program
+into proprietary programs. If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License. But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/PaddleCV/tracking/README.md b/PaddleCV/tracking/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e27ecb11f465e0438dae7623c3d5f6c96696e037
--- /dev/null
+++ b/PaddleCV/tracking/README.md
@@ -0,0 +1,309 @@
+# tracking: A Single Object Tracking Framework
+
+## Introduction
+
+tracking is a visual object tracking (VOT) library built on Baidu's deep learning framework PaddlePaddle. Its overall design follows [pytracking](https://github.com/visionml/pytracking), whose clean architecture makes it easy to integrate trackers such as SiamFC, SiamRPN and SiamMask into a single framework for unified experiments and comparisons.
+
+tracking currently covers the mainstream single object tracking models, including SiamFC, SiamRPN, SiamMask and ATOM. It aims to provide developers with a set of convenient and efficient PaddlePaddle-based deep learning tracking algorithms, and the set of supported models will keep growing.
+
+ATOM tracking demo:
+
+![ATOM tracking demo](imgs/ball1.gif)
+
+In the clip, the green box is the annotated ground-truth bbox and the red box is the bbox predicted by ATOM.
+
+## Code Structure
+
+
+```
+imgs                 images showing tracking results
+
+ltr                  model training code
+ └─ actors           take input data and produce the training objective (loss)
+ └─ admin            manage dataset paths, etc.
+ └─ data             multi-threaded data loading and preprocessing
+ └─ dataset          training dataset readers
+ └─ models           model definitions
+ └─ train_settings   training configurations
+ └─ trainers         model trainers
+ └─ run_training.py  training entry script
+
+pytracking           tracking (inference) code
+ └─ admin            manage dataset paths, model locations, etc.
+ └─ features         feature extraction
+ └─ libs             common tracking operations
+ └─ parameter        tracker parameter settings
+ └─ tracker          trackers
+ └─ utils            plotting and other utilities
+ └─ pysot-toolkit    evaluation dataset loading and metric computation
+ └─ eval_benchmark.py                      tracker evaluation entry script
+ └─ visualize_results_on_benchmark.ipynb   visualize tracking results
+```
+
+## Getting Started
+
+### Data Preparation
+
+In object tracking the training and test sets are different, and the best current models are usually trained on several training sets combined.
+
+The commonly used training datasets are:
+- [VID](http://bvisionweb1.cs.unc.edu/ilsvrc2015/ILSVRC2015_VID.tar.gz)
+- [Microsoft COCO 2014](http://cocodataset.org/#download)
+- [LaSOT](https://drive.google.com/file/d/1O2DLxPP8M4Pn4-XCttCJUW3A29tDIeNa/view)
+- [GOT-10K](http://got-10k.aitestunion.com/downloads_dataset/full_data)
+
+After downloading and extracting, organize the datasets as follows:
+```
+/Datasets/
+ └─ ILSVRC2015_VID/
+ └─ train2014/
+ └─ GOT-10K/
+ └─ LaSOTBenchmark/
+
+```
+`/Datasets/` is the root directory where the datasets are stored.
+
+Note: these datasets are large, so reserve enough disk space. Training SiamFC only requires the VID dataset, while training ATOM requires all of the datasets listed above.
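+
+For reference, a rough sketch of unpacking the downloads into the layout above (the archive names are assumptions based on the download pages and may differ):
+
+```bash
+# unpack each download so the top-level folders match the tree above
+mkdir -p /Datasets && cd /Datasets
+tar xzf /path/to/ILSVRC2015_VID.tar.gz     # -> ILSVRC2015_VID/
+unzip /path/to/train2014.zip               # COCO 2014 images -> train2014/
+# extract the LaSOT and GOT-10K archives into LaSOTBenchmark/ and GOT-10K/ likewise
+```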
+
+
+## Quick Start
+
+tracking requires:
+- python3
+- PaddlePaddle 1.7
+
+> Note: if you hit an error importing `cmath`, switch Python versions; Python 3.6.8 or 3.7.0 is recommended.
+
+### Install Dependencies
+
+1. Install PaddlePaddle 1.7. If your installed version is older, please upgrade to Paddle 1.7.
+```bash
+pip install paddlepaddle-gpu==1.7.0
+```
+
+2. Install the third-party dependencies; anaconda is recommended.
+```bash
+# (Optional) 0. Strongly recommended: create a new conda environment; after installing anaconda run
+# conda create -n paddle1.7-py3.6 python=3.6
+# conda activate paddle1.7-py3.6
+
+cd tracking
+pip install -r requirements.txt
+
+# (Optional) 1. Recommended: fast jpeg decoding
+apt-get install libturbojpeg
+
+# (Optional) 2. Recommended: process control
+apt-get install build-essential libcap-dev
+pip install python-prctl
+```
+
+
+
+### Download Pretrained Backbones
+
+Before starting training, prepare the pretrained backbone models for SiamRPN, SiamMask and ATOM.
+
+We provide backbone models for ATOM ResNet18 and ResNet50. A tarball with all pretrained models can be downloaded from [here](https://paddlemodels.bj.bcebos.com/paddle_track/vot/pretrained_models.tar).
+The archive extracts to a folder named `pretrained_models`. Its directory structure is:
+```
+/pretrained_models/
+ └─ atom
+ └─ atom_resnet18.pdparams
+ └─ atom_resnet50.pdparams
+ └─ backbone
+ └─ ResNet18.pdparams
+ └─ ResNet50.pdparams
+```
+The `/pretrained_models/backbone/` folder contains ResNet18 and ResNet50 models pretrained on ImageNet.
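+
+For example, to download and unpack the archive (the exact commands below are just a sketch; place the folder wherever you prefer):
+
+```bash
+wget https://paddlemodels.bj.bcebos.com/paddle_track/vot/pretrained_models.tar
+tar xvf pretrained_models.tar   # creates ./pretrained_models/
+```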
+
+
+### Configure Training
+
+Before launching training, set the dataset paths used by tracking and the directory where trained models are saved. These parameters are configured in ltr/admin/local.py.
+
+First, generate the local.py file.
+
+```bash
+# go to the repository root
+cd tracking
+
+```
+Next, set the checkpoint directory workspace_dir, the backbone model path backbone_dir, the dataset paths, and so on. Paths of datasets you do not use can be left unset.
+```
+# edit ltr/admin/local.py with your favorite editor
+# e.g., vim ltr/admin/local.py
+# where
+# workspace_dir = './checkpoints' # where trained models are saved
+# backbone_dir = Your BACKBONE_PATH # not needed when training SiamFC
+# then set the training datasets you need, such as VID, LaSOT, COCO, e.g.:
+# imagenet_dir = '/Datasets/ILSVRC2015/' # path of the VID training set
+
+# if ltr/admin/local.py does not exist, generate it with
+python -c "from ltr.admin.environment import create_default_local_file; create_default_local_file()"
+```
+
+Training SiamFC only requires configuring workspace_dir and imagenet_dir, as follows:
+```python
+ self.workspace_dir = './checkpoints'
+ self.imagenet_dir = '/Datasets/ILSVRC2015/'
+```
+Training ATOM additionally requires the COCO, LaSOT and GOT-10k dataset paths besides workspace_dir and imagenet_dir, for example:
+```python
+ self.workspace_dir = './checkpoints'
+ self.lasot_dir = '/Datasets/LaSOTBenchmark/'
+ self.coco_dir = '/Datasets/train2014/'
+ self.got10k_dir = '/Datasets/GOT-10k/train'
+ self.imagenet_dir = '/Datasets/ILSVRC2015/'
+```
+In addition, training ATOM requires the GOT-10k and LaSOT dataset split files, which can be prepared as follows:
+```bash
+cd ltr/data_specs/
+wget https://paddlemodels.cdn.bcebos.com/paddle_track/vot/got10k_lasot_split.tar
+tar xvf got10k_lasot_split.tar
+```
+
+
+### Launch Training
+
+```bash
+# go to the training code directory
+cd ltr
+
+# train ATOM with a ResNet18 backbone
+python run_training.py bbreg atom_res18_vid_lasot_coco
+
+# train ATOM with a ResNet50 backbone
+python run_training.py bbreg atom_res50_vid_lasot_coco
+
+# train SiamFC
+python run_training.py siamfc siamfc_alexnet_vid
+```
+
+
+## Model Evaluation
+
+Trained models are evaluated with the [pysot-toolkit](https://github.com/StrangerZhang/pysot-toolkit), which provides evaluation APIs for several single object tracking benchmarks. We recommend downloading the test datasets from the links provided by pysot-toolkit.
+
+After the test data is ready, clone and build the pysot-toolkit evaluation module with the following commands:
+
+```bash
+cd pytracking
+git clone https://github.com/StrangerZhang/pysot-toolkit.git
+mv pysot-toolkit pysot_toolkit
+cd pysot_toolkit
+pip install -r requirements.txt
+cd pysot/utils/
+python setup.py build_ext --inplace
+```
+
+### Prepare the Test Dataset
+Prepare the VOT2018 dataset following the pysot-toolkit instructions and place it under the /Datasets folder.
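+
+A sketch of the expected layout (assuming the usual pysot-toolkit convention of one folder per sequence plus a per-dataset JSON annotation file):
+
+```
+/Datasets/
+ └─ VOT2018/
+     └─ ants1/ ...       # one folder per video sequence
+     └─ VOT2018.json     # annotation file provided by pysot-toolkit
+```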
+
+### Configure the Evaluation Environment
+Next, set up the evaluation environment:
+```bash
+# in pytracking/admin/local.py, set the test dataset path, the model to evaluate, and where results are saved
+# edit pytracking/admin/local.py with your favorite editor
+# e.g., vim pytracking/admin/local.py
+# set settings.dataset_path to the test dataset path and settings.network_path to the trained model weights path
+
+# if pytracking/admin/local.py does not exist, generate it with
+python -c "from pytracking.admin.environment import create_default_local_file; create_default_local_file()"
+```
+
+### Prepare Test Data and Models
+Prepare the VOT2018 dataset following the pysot-toolkit instructions and place it in the folder given by settings.dataset_path, or point settings.dataset_path to wherever the test dataset is stored.
+
+
+Copy your trained models to `NETWORK_PATH`, or create a symlink, e.g.
+```bash
+ln -s tracking/ltr/Logs/checkpoints/ltr/bbreg/ $NETWORK_PATH/bbreg
+```
+
+### Run the Evaluation
+
+Evaluate the ATOM model:
+```bash
+# evaluate ATOM on VOT2018
+# -d VOT2018: evaluate on the VOT2018 dataset
+# -tr bbreg.atom_res18_vid_lasot_coco: the model to evaluate, consistent with the training setting
+# -te atom.default_vot: load the hyper-parameter file pytracking/parameter/atom/default_vot.py
+# -e 40: evaluate the checkpoint of epoch 40; it can also be set to 'range(1, 50, 1)' to evaluate every epoch from 1 to 49
+# -n 15: run the evaluation 15 times and average the results (default: 1)
+python eval_benchmark.py -d VOT2018 -tr bbreg.atom_res18_vid_lasot_coco -te atom.default_vot -e 40 -n 15
+```
+
+Evaluate SiamFC:
+```bash
+# evaluate SiamFC on VOT2018
+python eval_benchmark.py -d VOT2018 -tr siamfc.siamfc_alexnet_vid -te siamfc.default -e 'range(1, 50, 1)'
+```
+
+
+
+## Visualizing Tracking Results
+
+
+After evaluating on a benchmark, visualizing the tracker's output helps diagnose failure cases. Tracking results can be visualized as follows:
+```bash
+cd pytracking
+
+# start jupyter notebook; note the token printed in the terminal
+jupyter notebook --ip 0.0.0.0 --port 8888
+```
+
+Open the server's IP address plus the port in your browser, or open
+`http://localhost:8888` if running locally. If a token is required, check the terminal output of the `jupyter notebook --ip 0.0.0.0 --port 8888` command.
+
+Once the page loads, open `visualize_results_on_benchmark.ipynb` to visualize the results.
+
+## Results
+
+| Dataset | Model | Backbone | Paper | Ours | Weights |
+| :-------: | :-------: | :---: | :---: | :---------: |:---------: |
+|VOT2018| ATOM | Res18 | EAO: 0.401 | 0.399 | [model]() |
+|VOT2018| ATOM | AlexNet | EAO: 0.188 | 0.211 | [model]() |
+
+## Citations and References
+
+SiamFC **[[Paper]](https://arxiv.org/pdf/1606.09549.pdf) [[Code]](https://www.robots.ox.ac.uk/~luca/siamese-fc.html)**
+
+ @inproceedings{bertinetto2016fully,
+ title={Fully-convolutional siamese networks for object tracking},
+ author={Bertinetto, Luca and Valmadre, Jack and Henriques, Joao F and Vedaldi, Andrea and Torr, Philip HS},
+ booktitle={European conference on computer vision},
+ pages={850--865},
+ year={2016},
+ organization={Springer}
+ }
+
+ATOM **[[Paper]](https://arxiv.org/pdf/1811.07628.pdf) [[Raw results]](https://drive.google.com/drive/folders/1MdJtsgr34iJesAgL7Y_VelP8RvQm_IG_) [[Models]](https://drive.google.com/open?id=1EsNSQr25qfXHYLqjZaVZElbGdUg-nyzd) [[Training Code]](https://github.com/visionml/pytracking/blob/master/ltr/README.md#ATOM) [[Tracker Code]](https://github.com/visionml/pytracking/blob/master/pytracking/README.md#ATOM)**
+
+ @inproceedings{danelljan2019atom,
+ title={Atom: Accurate tracking by overlap maximization},
+ author={Danelljan, Martin and Bhat, Goutam and Khan, Fahad Shahbaz and Felsberg, Michael},
+ booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
+ pages={4660--4669},
+ year={2019}
+ }
+
+DiMP **[[Paper]](https://arxiv.org/pdf/1904.07220v1.pdf) [[Raw results]](https://drive.google.com/drive/folders/15mpUAJmzxemnOC6gmvMTCDJ-0v6hxJ7y) [[Models]](https://drive.google.com/open?id=1YEJySjhFokyQ6zgQg6vFAnzEFi1Onq7G) [[Training Code]](https://github.com/visionml/pytracking/blob/master/ltr/README.md#DiMP) [[Tracker Code]](https://github.com/visionml/pytracking/blob/master/pytracking/README.md#DiMP)**
+
+ @inproceedings{bhat2019learning,
+ title={Learning discriminative model prediction for tracking},
+ author={Bhat, Goutam and Danelljan, Martin and Gool, Luc Van and Timofte, Radu},
+ booktitle={Proceedings of the IEEE International Conference on Computer Vision},
+ pages={6182--6191},
+ year={2019}
+ }
+
+ECO **[[Paper]](https://arxiv.org/pdf/1611.09224.pdf) [[Models]](https://drive.google.com/open?id=1aWC4waLv_te-BULoy0k-n_zS-ONms21S) [[Tracker Code]](https://github.com/visionml/pytracking/blob/master/pytracking/README.md#ECO)**
+
+ @inproceedings{danelljan2017eco,
+ title={Eco: Efficient convolution operators for tracking},
+ author={Danelljan, Martin and Bhat, Goutam and Shahbaz Khan, Fahad and Felsberg, Michael},
+ booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
+ pages={6638--6646},
+ year={2017}
+ }
diff --git a/PaddleCV/tracking/imgs/ball1.gif b/PaddleCV/tracking/imgs/ball1.gif
new file mode 100644
index 0000000000000000000000000000000000000000..e8fd8ca45a55ebe13e0d98adf39db3bc4eeac7d6
Binary files /dev/null and b/PaddleCV/tracking/imgs/ball1.gif differ
diff --git a/PaddleCV/tracking/ltr/actors/__init__.py b/PaddleCV/tracking/ltr/actors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b89e0f6077181d2d66cd2f546a2d9bf7aac7db4
--- /dev/null
+++ b/PaddleCV/tracking/ltr/actors/__init__.py
@@ -0,0 +1,3 @@
+from .base_actor import BaseActor
+from .bbreg import AtomActor
+from .siamfc import SiamFCActor
diff --git a/PaddleCV/tracking/ltr/actors/base_actor.py b/PaddleCV/tracking/ltr/actors/base_actor.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4b456c97a1d1a0b267e058dd553c07d3b94367f
--- /dev/null
+++ b/PaddleCV/tracking/ltr/actors/base_actor.py
@@ -0,0 +1,26 @@
+from pytracking.libs import TensorDict
+
+
+class BaseActor:
+    """ Base class for actors. The actor class handles passing the data through the network
+    and computing the loss."""
+
+ def __init__(self, net, objective):
+ """
+ args:
+ net - The network to train
+ objective - The loss function
+ """
+ self.net = net
+ self.objective = objective
+
+ def train(self):
+        """ Set the network to train mode."""
+ self.net.train()
+
+ def eval(self):
+ """ Set network to eval mode"""
+ self.net.eval()
diff --git a/PaddleCV/tracking/ltr/actors/bbreg.py b/PaddleCV/tracking/ltr/actors/bbreg.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed74bd7ee19ab45b3d6c982abe6e7348657f3b67
--- /dev/null
+++ b/PaddleCV/tracking/ltr/actors/bbreg.py
@@ -0,0 +1,38 @@
+from . import BaseActor
+import paddle.fluid as fluid
+
+
+class AtomActor(BaseActor):
+ """ Actor for training the IoU-Net in ATOM"""
+
+ def __call__(self, data):
+ """
+ args:
+ data - The input data, should contain the fields 'train_images', 'test_images', 'train_anno',
+ 'test_proposals' and 'proposal_iou'.
+
+ returns:
+ loss - the training loss
+ states - dict containing detailed losses
+ """
+ # Run network to obtain IoU prediction for each proposal in 'test_proposals'
+ iou_pred = self.net(data['train_images'], data['test_images'],
+ data['train_anno'], data['test_proposals'])
+
+ iou_pred = fluid.layers.reshape(iou_pred, [-1, iou_pred.shape[2]])
+ iou_gt = fluid.layers.reshape(data['proposal_iou'],
+ [-1, data['proposal_iou'].shape[2]])
+
+ # Compute loss
+ loss = self.objective(iou_pred, iou_gt)
+ loss = fluid.layers.mean(loss)
+
+        # Apply the network's loss scaling via scale_loss, if it defines one
+ scale_loss = getattr(self.net, "scale_loss", None)
+ if callable(scale_loss):
+ loss = scale_loss(loss)
+
+ # Return training stats
+ stats = {'Loss/total': loss.numpy(), 'Loss/iou': loss.numpy()}
+
+ return loss, stats
diff --git a/PaddleCV/tracking/ltr/actors/siamfc.py b/PaddleCV/tracking/ltr/actors/siamfc.py
new file mode 100644
index 0000000000000000000000000000000000000000..d42970875a8bcb808ec23340307cbaef795332ca
--- /dev/null
+++ b/PaddleCV/tracking/ltr/actors/siamfc.py
@@ -0,0 +1,46 @@
+import numpy as np
+import paddle.fluid as fluid
+
+from . import BaseActor
+
+
+class SiamFCActor(BaseActor):
+    """ Actor for training SiamFC"""
+
+ def __init__(self, net, objective, batch_size, shape, radius, stride):
+ super().__init__(net, objective)
+        self.label_mask, self.label_weights = self._create_gt_mask(
+            batch_size, shape, radius, stride)
+
+    def _create_gt_mask(self, batch_size, shape, radius, stride):
+ h, w = shape
+ y = np.arange(h, dtype=np.float32) - (h - 1) / 2.
+ x = np.arange(w, dtype=np.float32) - (w - 1) / 2.
+ y, x = np.meshgrid(y, x)
+ dist = np.sqrt(x**2 + y**2)
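+        # cells within radius/stride of the score-map center are labeled positive, the rest negative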
+ mask = np.zeros((h, w))
+ mask[dist <= radius / stride] = 1
+ mask = mask[np.newaxis, :, :]
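+        # weight the two classes so positives and negatives each contribute half of the total loss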
+ weights = np.ones_like(mask)
+ weights[mask == 1] = 0.5 / np.sum(mask == 1)
+ weights[mask == 0] = 0.5 / np.sum(mask == 0)
+ mask = np.repeat(mask, batch_size, axis=0)[:, np.newaxis, :, :]
+ weights = np.repeat(weights, batch_size, axis=0)[:, np.newaxis, :, :]
+ weights = fluid.dygraph.to_variable(weights.astype(np.float32))
+ mask = fluid.dygraph.to_variable(mask.astype(np.float32))
+ return mask, weights
+
+ def __call__(self, data):
+        # Run the network to obtain the response maps for the test (search) images
+ target_estimations = self.net(data['train_images'], data['test_images'])
+
+ # weighted loss
+ loss_mat = fluid.layers.sigmoid_cross_entropy_with_logits(
+ target_estimations, self.label_mask, normalize=False)
+ loss = fluid.layers.elementwise_mul(loss_mat, self.label_weights)
+ loss = fluid.layers.reduce_sum(loss) / loss.shape[0]
+
+ # Return training stats
+ stats = {'Loss/total': loss.numpy(), 'Loss/center': loss.numpy()}
+
+ return loss, stats
diff --git a/PaddleCV/tracking/ltr/admin/__init__.py b/PaddleCV/tracking/ltr/admin/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/PaddleCV/tracking/ltr/admin/environment.py b/PaddleCV/tracking/ltr/admin/environment.py
new file mode 100644
index 0000000000000000000000000000000000000000..590d56d878298c316eb38ac37be75a86a19c9d64
--- /dev/null
+++ b/PaddleCV/tracking/ltr/admin/environment.py
@@ -0,0 +1,53 @@
+import importlib
+import os
+from collections import OrderedDict
+
+
+def create_default_local_file():
+ path = os.path.join(os.path.dirname(__file__), 'local.py')
+
+ empty_str = '\'\''
+ default_settings = OrderedDict({
+ 'workspace_dir': empty_str,
+ 'tensorboard_dir': 'self.workspace_dir + \'/tensorboard/\'',
+ 'backbone_dir': empty_str,
+ 'lasot_dir': empty_str,
+ 'got10k_dir': empty_str,
+ 'trackingnet_dir': empty_str,
+ 'coco_dir': empty_str,
+ 'imagenet_dir': empty_str,
+ 'imagenetdet_dir': empty_str
+ })
+
+ comment = {
+ 'workspace_dir': 'Base directory for saving network checkpoints.',
+ 'tensorboard_dir': 'Directory for tensorboard files.'
+ }
+
+ with open(path, 'w') as f:
+ f.write('class EnvironmentSettings:\n')
+ f.write(' def __init__(self):\n')
+
+ for attr, attr_val in default_settings.items():
+ comment_str = None
+ if attr in comment:
+ comment_str = comment[attr]
+ if comment_str is None:
+ f.write(' self.{} = {}\n'.format(attr, attr_val))
+ else:
+ f.write(' self.{} = {} # {}\n'.format(attr, attr_val,
+ comment_str))
+
+
+def env_settings():
+ env_module_name = 'ltr.admin.local'
+ try:
+ env_module = importlib.import_module(env_module_name)
+ return env_module.EnvironmentSettings()
+ except:
+ env_file = os.path.join(os.path.dirname(__file__), 'local.py')
+
+ create_default_local_file()
+ raise RuntimeError(
+ 'YOU HAVE NOT SETUP YOUR local.py!!!\n Go to "{}" and set all the paths you need. Then try to run again.'.
+ format(env_file))
diff --git a/PaddleCV/tracking/ltr/admin/local.py b/PaddleCV/tracking/ltr/admin/local.py
new file mode 100644
index 0000000000000000000000000000000000000000..f598f81482ab3bb74365b2dfc3da04362825f4b6
--- /dev/null
+++ b/PaddleCV/tracking/ltr/admin/local.py
@@ -0,0 +1,11 @@
+class EnvironmentSettings:
+ def __init__(self):
+ self.workspace_dir = '' # Base directory for saving network checkpoints.
+ self.tensorboard_dir = self.workspace_dir + '/tensorboard/' # Directory for tensorboard files.
+ self.backbone_dir = ''
+ self.lasot_dir = ''
+ self.got10k_dir = ''
+ self.trackingnet_dir = ''
+ self.coco_dir = ''
+ self.imagenet_dir = ''
+ self.imagenetdet_dir = ''
diff --git a/PaddleCV/tracking/ltr/admin/model_constructor.py b/PaddleCV/tracking/ltr/admin/model_constructor.py
new file mode 100644
index 0000000000000000000000000000000000000000..89ed126ab88b6c52edb6e9d0ccaac233eecff714
--- /dev/null
+++ b/PaddleCV/tracking/ltr/admin/model_constructor.py
@@ -0,0 +1,48 @@
+from functools import wraps
+import importlib
+
+
+def model_constructor(f):
+ """ Wraps the function 'f' which returns the network. An extra field 'constructor' is added to the network returned
+ by 'f'. This field contains an instance of the 'NetConstructor' class, which contains the information needed to
+ re-construct the network, such as the name of the function 'f', the function arguments etc. Thus, the network can
+ be easily constructed from a saved checkpoint by calling NetConstructor.get() function.
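+
+    A minimal usage sketch (ATOMnet below stands in for any network class):
+
+        @model_constructor
+        def atom_resnet18(backbone_pretrained=True):
+            return ATOMnet(backbone_pretrained=backbone_pretrained)
+
+        net = atom_resnet18()
+        net_copy = net.constructor.get()  # rebuilds the network from the stored call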
+ """
+
+ @wraps(f)
+ def f_wrapper(*args, **kwds):
+ net_constr = NetConstructor(f.__name__, f.__module__, args, kwds)
+ output = f(*args, **kwds)
+ if isinstance(output, (tuple, list)):
+ # Assume first argument is the network
+ output[0].constructor = net_constr
+ else:
+ output.constructor = net_constr
+ return output
+
+ return f_wrapper
+
+
+class NetConstructor:
+ """ Class to construct networks. Takes as input the function name (e.g. atom_resnet18), the name of the module
+ which contains the network function (e.g. ltr.models.bbreg.atom) and the arguments for the network
+ function. The class object can then be stored along with the network weights to re-construct the network."""
+
+ def __init__(self, fun_name, fun_module, args, kwds):
+ """
+ args:
+ fun_name - The function which returns the network
+ fun_module - the module which contains the network function
+ args - arguments which are passed to the network function
+ kwds - arguments which are passed to the network function
+ """
+ self.fun_name = fun_name
+ self.fun_module = fun_module
+ self.args = args
+ self.kwds = kwds
+
+ def get(self):
+ """ Rebuild the network by calling the network function with the correct arguments. """
+ net_module = importlib.import_module(self.fun_module)
+ net_fun = getattr(net_module, self.fun_name)
+ return net_fun(*self.args, **self.kwds)
diff --git a/PaddleCV/tracking/ltr/admin/settings.py b/PaddleCV/tracking/ltr/admin/settings.py
new file mode 100644
index 0000000000000000000000000000000000000000..2698b2427357590287092b539bc863447ffe391d
--- /dev/null
+++ b/PaddleCV/tracking/ltr/admin/settings.py
@@ -0,0 +1,12 @@
+from ltr.admin.environment import env_settings
+
+
+class Settings:
+ """ Training settings, e.g. the paths to datasets and networks."""
+
+ def __init__(self):
+ self.set_default()
+
+ def set_default(self):
+ self.env = env_settings()
+ self.use_gpu = True
diff --git a/PaddleCV/tracking/ltr/admin/stats.py b/PaddleCV/tracking/ltr/admin/stats.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4b46ed18955afc3b1d7a6c9f28400998a0a2f79
--- /dev/null
+++ b/PaddleCV/tracking/ltr/admin/stats.py
@@ -0,0 +1,70 @@
+class StatValue:
+ def __init__(self):
+ self.clear()
+
+ def reset(self):
+ self.val = 0
+
+ def clear(self):
+ self.reset()
+ self.history = []
+
+ def update(self, val):
+ self.val = val
+ self.history.append(self.val)
+
+
+class AverageMeter(object):
+ """Computes and stores the average and current value"""
+
+ def __init__(self):
+ self.clear()
+ self.has_new_data = False
+
+ def reset(self):
+ self.avg = 0
+ self.val = 0
+ self.sum = 0
+ self.count = 0
+
+ def clear(self):
+ self.reset()
+ self.history = []
+
+ def update(self, val, n=1):
+ self.val = val
+ self.sum += val * n
+ self.count += n
+ self.avg = self.sum / self.count
+
+ def new_epoch(self):
+ if self.count > 0:
+ self.history.append(self.avg)
+ self.reset()
+ self.has_new_data = True
+ else:
+ self.has_new_data = False
+
+
+def topk_accuracy(output, target, topk=(1, )):
+ """Computes the precision@k for the specified values of k"""
+ single_input = not isinstance(topk, (tuple, list))
+ if single_input:
+ topk = (topk, )
+
+ maxk = max(topk)
+ batch_size = target.size(0)
+
+ _, pred = output.topk(maxk, 1, True, True)
+ pred = pred.t()
+ correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+ res = []
+ for k in topk:
+ correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)[0]
+ res.append(correct_k * 100.0 / batch_size)
+
+ if single_input:
+ return res[0]
+
+ return res
diff --git a/PaddleCV/tracking/ltr/admin/tensorboard.py b/PaddleCV/tracking/ltr/admin/tensorboard.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6d4de6965a67ee46359c596df264956f52a4173
--- /dev/null
+++ b/PaddleCV/tracking/ltr/admin/tensorboard.py
@@ -0,0 +1,29 @@
+import os
+from collections import OrderedDict
+from tensorboardX import SummaryWriter
+
+
+class TensorboardWriter:
+ def __init__(self, directory, loader_names):
+ self.directory = directory
+ self.writer = OrderedDict({
+ name: SummaryWriter(os.path.join(self.directory, name))
+ for name in loader_names
+ })
+
+ def write_info(self, module_name, script_name, description):
+ tb_info_writer = SummaryWriter(os.path.join(self.directory, 'info'))
+ tb_info_writer.add_text('Module_name', module_name)
+ tb_info_writer.add_text('Script_name', script_name)
+ tb_info_writer.add_text('Description', description)
+ tb_info_writer.close()
+
+ def write_epoch(self, stats: OrderedDict, epoch: int, ind=-1):
+ for loader_name, loader_stats in stats.items():
+ if loader_stats is None:
+ continue
+ for var_name, val in loader_stats.items():
+ if hasattr(val, 'history') and getattr(val, 'has_new_data',
+ True):
+ self.writer[loader_name].add_scalar(var_name,
+ val.history[ind], epoch)
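+
+
+if __name__ == '__main__':
+    # Hedged usage sketch: './tb_logs' is an illustrative output directory. Any
+    # object exposing `history` and `has_new_data` (e.g. ltr.admin.stats.AverageMeter)
+    # can be logged; a tiny stand-in keeps the example self-contained.
+    class _Stat:
+        history = [0.5]
+        has_new_data = True
+
+    writer = TensorboardWriter('./tb_logs', ['train'])
+    writer.write_epoch(
+        OrderedDict({'train': OrderedDict({'Loss/total': _Stat()})}), epoch=1)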
diff --git a/PaddleCV/tracking/ltr/data/__init__.py b/PaddleCV/tracking/ltr/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ffe126ba0443207aa426d0f3e2b19272de6236d
--- /dev/null
+++ b/PaddleCV/tracking/ltr/data/__init__.py
@@ -0,0 +1 @@
+from .loader import LTRLoader
diff --git a/PaddleCV/tracking/ltr/data/image_loader.py b/PaddleCV/tracking/ltr/data/image_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..12bda4ee44ca382522119b692d70360a50bb8ba0
--- /dev/null
+++ b/PaddleCV/tracking/ltr/data/image_loader.py
@@ -0,0 +1,63 @@
+import jpeg4py
+import cv2 as cv
+import lmdb
+import numpy as np
+
+
+def default_image_loader(path):
+ """The default image loader, reads the image from the given path. It first tries to use the jpeg4py_loader,
+ but falls back to the opencv_loader if jpeg4py is unavailable or fails to decode the image."""
+ if default_image_loader.use_jpeg4py is None:
+ # Try using jpeg4py
+ im = jpeg4py_loader(path)
+ if im is None:
+ default_image_loader.use_jpeg4py = False
+ print('Using opencv_loader instead.')
+ else:
+ default_image_loader.use_jpeg4py = True
+ return im
+ if default_image_loader.use_jpeg4py:
+ return jpeg4py_loader(path)
+ return opencv_loader(path)
+
+
+default_image_loader.use_jpeg4py = None
+
+
+def jpeg4py_loader(path):
+ """ Image reading using jpeg4py (https://github.com/ajkxyz/jpeg4py)"""
+ try:
+ return jpeg4py.JPEG(path).decode()
+ except Exception as e:
+ print('ERROR: Could not read image "{}"'.format(path))
+ print(e)
+ return None
+
+
+def opencv_loader(path):
+ """ Read image using opencv's imread function and returns it in rgb format"""
+ try:
+ im = cv.imread(path, cv.IMREAD_COLOR)
+ # convert to rgb and return
+ return cv.cvtColor(im, cv.COLOR_BGR2RGB)
+ except Exception as e:
+ print('ERROR: Could not read image "{}"'.format(path))
+ print(e)
+ return None
+
+
+def lmdb_loader(path, lmdb_path=None):
+ try:
+ if lmdb_loader.txn is None:
+ db = lmdb.open(lmdb_path, readonly=True, map_size=int(300e9))
+ lmdb_loader.txn = db.begin(write=False)
+ img_buffer = lmdb_loader.txn.get(path.encode())
+ img_buffer = np.frombuffer(img_buffer, np.uint8)
+ return cv.imdecode(img_buffer, cv.IMREAD_COLOR)
+ except Exception as e:
+ print('ERROR: Could not read image "{}"'.format(path))
+ print(e)
+ return None
+
+
+lmdb_loader.txn = None
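+
+
+if __name__ == '__main__':
+    # Hedged usage sketch: 'frame.jpg' is a placeholder path, not a file shipped
+    # with the repository. The first call probes jpeg4py and falls back to
+    # OpenCV if decoding fails; both loaders return HxWx3 RGB arrays.
+    img = default_image_loader('frame.jpg')
+    if img is not None:
+        print(img.shape)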
diff --git a/PaddleCV/tracking/ltr/data/loader.py b/PaddleCV/tracking/ltr/data/loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d7c8f11c95cb3621fe57f125efb8fce6e5434e8
--- /dev/null
+++ b/PaddleCV/tracking/ltr/data/loader.py
@@ -0,0 +1,115 @@
+import os
+import signal
+import sys
+
+import dataflow as df
+import numpy as np
+
+
+# handle terminate reader process, do not print stack frame
+def _reader_quit(signum, frame):
+ print("Reader process exit.")
+ sys.exit()
+
+
+def _term_group(sig_num, frame):
+ print('pid {} terminated, terminate group '
+ '{}...'.format(os.getpid(), os.getpgrp()))
+ os.killpg(os.getpgid(os.getpid()), signal.SIGKILL)
+
+
+signal.signal(signal.SIGTERM, _reader_quit)
+signal.signal(signal.SIGINT, _term_group)
+
+
+class LTRLoader(df.DataFlow):
+ """
+ Data loader. Combines a dataset and a sampler, and provides
+ single- or multi-process iterators over the dataset.
+
+ Note: an additional option stack_dim is available to
+ select along which dimension the data should be stacked to form a batch.
+
+ Arguments:
+ dataset (Dataset): dataset from which to load the data.
+ batch_size (int, optional): how many samples per batch to load
+ (default: 1).
+ shuffle (bool, optional): set to ``True`` to have the data reshuffled
+ at every epoch (default: False).
+ sampler (Sampler, optional): defines the strategy to draw samples from
+ the dataset. If specified, ``shuffle`` must be False.
+ batch_sampler (Sampler, optional): like sampler, but returns a batch of
+ indices at a time. Mutually exclusive with batch_size, shuffle,
+ sampler, and drop_last.
+ num_workers (int, optional): how many subprocesses to use for data
+ loading. 0 means that the data will be loaded in the main process.
+ (default: 0)
+ collate_fn (callable, optional): merges a list of samples to form a mini-batch.
+ stack_dim (int): Dimension along which to stack to form the batch. (default: 0)
+ pin_memory (bool, optional): If ``True``, the data loader will copy tensors
+ into CUDA pinned memory before returning them.
+ drop_last (bool, optional): set to ``True`` to drop the last incomplete batch,
+ if the dataset size is not divisible by the batch size. If ``False`` and
+ the size of dataset is not divisible by the batch size, then the last batch
+ will be smaller. (default: False)
+ timeout (numeric, optional): if positive, the timeout value for collecting a batch
+ from workers. Should always be non-negative. (default: 0)
+ worker_init_fn (callable, optional): If not None, this will be called on each
+ worker subprocess with the worker id (an int in ``[0, num_workers - 1]``) as
+ input, after seeding and before data loading. (default: None)
+
+
+ .. warning:: If ``spawn`` start method is used, :attr:`worker_init_fn` cannot be an
+ unpicklable object, e.g., a lambda function.
+ """
+
+ __initialized = False
+
+ def __init__(self,
+ name,
+ dataset,
+ training=True,
+ batch_size=1,
+ shuffle=False,
+ sampler=None,
+ batch_sampler=None,
+ num_workers=0,
+ epoch_interval=1,
+ collate_fn=None,
+ stack_dim=0,
+ pin_memory=False,
+ drop_last=False,
+ timeout=0,
+ worker_init_fn=None):
+
+ super().__init__()
+
+ ds = df.RepeatedData(dataset, -1)
+ ds = df.MultiProcessRunnerZMQ(ds, num_proc=num_workers, hwm=300)
+ # ds = df.MultiThreadRunner(lambda: ds, num_prefetch=1024, num_thread=num_workers)
+ ds = df.BatchData(ds, batch_size)
+ self.ds = ds
+
+ self.name = name
+ self.training = training
+ self.epoch_interval = epoch_interval
+ self.stack_dim = stack_dim
+ self.batches_per_epoch = len(dataset) // batch_size
+
+ def __len__(self):
+ return self.batches_per_epoch
+
+ def __iter__(self):
+ if not self.__initialized:
+ self.reset_state()
+ self.__initialized = True
+
+ for d in self.ds:
+ if self.stack_dim > 0:
+ for k, v in d.items():
+ if len(v.shape) >= self.stack_dim + 1:
+ d[k] = np.swapaxes(v, 0, self.stack_dim)
+ yield d
+
+ def reset_state(self):
+ self.ds.reset_state()
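+
+
+# Hedged wiring sketch (comments only): the loader is normally fed a dataflow-style
+# sampler that yields dicts of numpy arrays, e.g. ltr.data.sampler.ATOMSampler.
+# The batch size, worker count and stack_dim below are illustrative, not defaults.
+#
+#     loader = LTRLoader('train', sampler, training=True, batch_size=64,
+#                        num_workers=4, stack_dim=1)
+#     for batch in loader:  # dict of batched numpy arrays
+#         train_images = batch['train_images']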
diff --git a/PaddleCV/tracking/ltr/data/processing.py b/PaddleCV/tracking/ltr/data/processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab207da0020d38ce47419c0053bab12a37bcf81b
--- /dev/null
+++ b/PaddleCV/tracking/ltr/data/processing.py
@@ -0,0 +1,262 @@
+import numpy as np
+
+from ltr.data import transforms
+import ltr.data.processing_utils as prutils
+from pytracking.libs import TensorDict
+
+
+class BaseProcessing:
+ """ Base class for Processing. Processing class is used to process the data returned by a dataset, before passing it
+ through the network. For example, it can be used to crop a search region around the object, apply various data
+ augmentations, etc."""
+
+ def __init__(self,
+ transform=transforms.ToArray(),
+ train_transform=None,
+ test_transform=None,
+ joint_transform=None):
+ """
+ args:
+ transform - The set of transformations to be applied on the images. Used only if train_transform or
+ test_transform is None.
+ train_transform - The set of transformations to be applied on the train images. If None, the 'transform'
+ argument is used instead.
+ test_transform - The set of transformations to be applied on the test images. If None, the 'transform'
+ argument is used instead.
+ joint_transform - The set of transformations to be applied 'jointly' on the train and test images. For
+ example, it can be used to convert both test and train images to grayscale.
+ """
+ self.transform = {
+ 'train': transform if train_transform is None else train_transform,
+ 'test': transform if test_transform is None else test_transform,
+ 'joint': joint_transform
+ }
+
+ def __call__(self, data: TensorDict):
+ raise NotImplementedError
+
+
+class SiamFCProcessing(BaseProcessing):
+ def __init__(self,
+ search_area_factor,
+ output_sz,
+ center_jitter_factor,
+ scale_jitter_factor,
+ mode='pair',
+ scale_type='context',
+ border_type='meanpad',
+ *args,
+ **kwargs):
+ super().__init__(*args, **kwargs)
+ self.search_area_factor = search_area_factor
+ self.output_sz = output_sz
+ self.center_jitter_factor = center_jitter_factor
+ self.scale_jitter_factor = scale_jitter_factor
+ self.mode = mode
+ self.scale_type = scale_type
+ self.border_type = border_type
+
+ def _get_jittered_box(self, box, mode, rng):
+ jittered_size = box[2:4] * np.exp(
+ rng.randn(2) * self.scale_jitter_factor[mode])
+ max_offset = (np.sqrt(jittered_size.prod()) *
+ self.center_jitter_factor[mode])
+ jittered_center = box[0:2] + 0.5 * box[2:4] + max_offset * (rng.rand(2)
+ - 0.5)
+
+ return np.concatenate(
+ (jittered_center - 0.5 * jittered_size, jittered_size), axis=0)
+
+ def __call__(self, data: TensorDict, rng=None):
+ # Apply joint transforms
+ if self.transform['joint'] is not None:
+ num_train_images = len(data['train_images'])
+ all_images = data['train_images'] + data['test_images']
+ all_images_trans = self.transform['joint'](*all_images)
+
+ data['train_images'] = all_images_trans[:num_train_images]
+ data['test_images'] = all_images_trans[num_train_images:]
+
+ for s in ['train', 'test']:
+ assert self.mode == 'sequence' or len(data[s + '_images']) == 1, \
+ "In pair mode, num train/test frames must be 1"
+
+ # Add a uniform noise to the center pos
+ jittered_anno = [
+ self._get_jittered_box(a, s, rng) for a in data[s + '_anno']
+ ]
+
+ # Crop image region centered at jittered_anno box
+ try:
+ crops, boxes = prutils.jittered_center_crop(
+ data[s + '_images'],
+ jittered_anno,
+ data[s + '_anno'],
+ self.search_area_factor[s],
+ self.output_sz[s],
+ scale_type=self.scale_type,
+ border_type=self.border_type)
+ except Exception as e:
+ print('{}, anno: {}'.format(data['dataset'], data[s + '_anno']))
+ raise e
+
+ # Apply transforms
+ data[s + '_images'] = [self.transform[s](x) for x in crops]
+ data[s + '_anno'] = boxes
+
+ # Prepare output
+ if self.mode == 'sequence':
+ data = data.apply(prutils.stack_tensors)
+ else:
+ data = data.apply(lambda x: x[0] if isinstance(x, list) else x)
+
+ return data
+
+
+class ATOMProcessing(BaseProcessing):
+ """ The processing class used for training ATOM. The images are processed in the following way.
+ First, the target bounding box is jittered by adding some noise. Next, a square region (called the search region),
+ centered at the jittered target center, and of area search_area_factor^2 times the area of the jittered box is
+ cropped from the image. The reason for jittering the target box is to avoid learning the bias that the target is
+ always at the center of the search region. The search region is then resized to a fixed size given by the
+ argument output_sz. A set of proposals are then generated for the test images by jittering the ground truth box.
+
+ """
+
+ def __init__(self,
+ search_area_factor,
+ output_sz,
+ center_jitter_factor,
+ scale_jitter_factor,
+ proposal_params,
+ mode='pair',
+ *args,
+ **kwargs):
+ """
+ args:
+ search_area_factor - The size of the search region relative to the target size.
+ output_sz - An integer, denoting the size to which the search region is resized. The search region is always
+ square.
+ center_jitter_factor - A dict containing the amount of jittering to be applied to the target center before
+ extracting the search region. See _get_jittered_box for how the jittering is done.
+ scale_jitter_factor - A dict containing the amount of jittering to be applied to the target size before
+ extracting the search region. See _get_jittered_box for how the jittering is done.
+ proposal_params - Arguments for the proposal generation process. See _generate_proposals for details.
+ mode - Either 'pair' or 'sequence'. If mode='sequence', then output has an extra dimension for frames
+ """
+ super().__init__(*args, **kwargs)
+ self.search_area_factor = search_area_factor
+ self.output_sz = output_sz
+ self.center_jitter_factor = center_jitter_factor
+ self.scale_jitter_factor = scale_jitter_factor
+ self.proposal_params = proposal_params
+ self.mode = mode
+
+ def _get_jittered_box(self, box, mode, rng):
+ """ Jitter the input box
+ args:
+ box - input bounding box
+ mode - string 'train' or 'test' indicating train or test data
+
+ returns:
+ Variable - jittered box
+ """
+
+ jittered_size = box[2:4] * np.exp(
+ rng.randn(2) * self.scale_jitter_factor[mode])
+ max_offset = (np.sqrt(jittered_size.prod()) *
+ self.center_jitter_factor[mode])
+ jittered_center = box[0:2] + 0.5 * box[2:4] + max_offset * (rng.rand(2)
+ - 0.5)
+
+ return np.concatenate(
+ (jittered_center - 0.5 * jittered_size, jittered_size), axis=0)
+
+ def _generate_proposals(self, box, rng):
+ """ Generates proposals by adding noise to the input box
+ args:
+ box - input box
+
+ returns:
+ array - Array of shape (num_proposals, 4) containing proposals
+ array - Array of shape (num_proposals,) containing IoU overlap of each proposal with the input box. The
+ IoU is mapped to [-1, 1]
+ """
+ # Generate proposals
+ num_proposals = self.proposal_params['boxes_per_frame']
+ proposals = np.zeros((num_proposals, 4))
+ gt_iou = np.zeros(num_proposals)
+
+ for i in range(num_proposals):
+ proposals[i, :], gt_iou[i] = prutils.perturb_box(
+ box,
+ min_iou=self.proposal_params['min_iou'],
+ sigma_factor=self.proposal_params['sigma_factor'],
+ rng=rng)
+
+ # Map to [-1, 1]
+ gt_iou = gt_iou * 2 - 1
+ return proposals, gt_iou
+
+ def __call__(self, data: TensorDict, rng=None):
+ """
+ args:
+ data - The input data, should contain the following fields:
+ 'train_images' -
+ 'test_images' -
+ 'train_anno' -
+ 'test_anno' -
+
+ returns:
+ TensorDict - output data block with following fields:
+ 'train_images' -
+ 'test_images' -
+ 'train_anno' -
+ 'test_anno' -
+ 'test_proposals'-
+ 'proposal_iou' -
+ """
+ # Apply joint transforms
+ if self.transform['joint'] is not None:
+ num_train_images = len(data['train_images'])
+ all_images = data['train_images'] + data['test_images']
+ all_images_trans = self.transform['joint'](*all_images)
+
+ data['train_images'] = all_images_trans[:num_train_images]
+ data['test_images'] = all_images_trans[num_train_images:]
+
+ for s in ['train', 'test']:
+ assert self.mode == 'sequence' or len(data[s + '_images']) == 1, \
+ "In pair mode, num train/test frames must be 1"
+
+ # Add a uniform noise to the center pos
+ jittered_anno = [
+ self._get_jittered_box(a, s, rng) for a in data[s + '_anno']
+ ]
+
+ # Crop image region centered at jittered_anno box
+ try:
+ crops, boxes = prutils.jittered_center_crop(
+ data[s + '_images'], jittered_anno, data[s + '_anno'],
+ self.search_area_factor, self.output_sz)
+ except Exception as e:
+ print('{}, anno: {}'.format(data['dataset'], data[s + '_anno']))
+ raise e
+ # Apply transforms
+ data[s + '_images'] = [self.transform[s](x) for x in crops]
+ data[s + '_anno'] = boxes
+
+ # Generate proposals
+ frame2_proposals, gt_iou = zip(
+ * [self._generate_proposals(a, rng) for a in data['test_anno']])
+
+ data['test_proposals'] = list(frame2_proposals)
+ data['proposal_iou'] = list(gt_iou)
+
+ # Prepare output
+ if self.mode == 'sequence':
+ data = data.apply(prutils.stack_tensors)
+ else:
+ data = data.apply(lambda x: x[0] if isinstance(x, list) else x)
+
+ return data
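+
+
+if __name__ == '__main__':
+    # Hedged configuration sketch: the numbers below are illustrative, not the
+    # repository's training defaults. They show how the jitter and proposal
+    # arguments fit together; __call__ then expects a TensorDict with
+    # 'train_images', 'test_images', 'train_anno' and 'test_anno'.
+    proposal_params = {
+        'boxes_per_frame': 16,
+        'min_iou': 0.1,
+        'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3]
+    }
+    processing = ATOMProcessing(
+        search_area_factor=5.0,
+        output_sz=288,
+        center_jitter_factor={'train': 0, 'test': 4.5},
+        scale_jitter_factor={'train': 0, 'test': 0.5},
+        proposal_params=proposal_params,
+        mode='sequence',
+        transform=transforms.ToArrayAndJitter(0.2))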
diff --git a/PaddleCV/tracking/ltr/data/processing_utils.py b/PaddleCV/tracking/ltr/data/processing_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f38286ff823c0242f1c6024073fd8a557fb767f
--- /dev/null
+++ b/PaddleCV/tracking/ltr/data/processing_utils.py
@@ -0,0 +1,288 @@
+import math
+import numpy as np
+import cv2 as cv
+
+
+def stack_tensors(x):
+ if isinstance(x, list) and isinstance(x[0], np.ndarray):
+ return np.stack(x)
+ return x
+
+
+def sample_target(im,
+ target_bb,
+ search_area_factor,
+ output_sz=None,
+ scale_type='original',
+ border_type='replicate'):
+ """ Extracts a square crop centered at target_bb box, of area search_area_factor^2 times target_bb area
+
+ args:
+ im - cv image
+ target_bb - target box [x, y, w, h]
+ search_area_factor - Ratio of crop size to target size
+ output_sz - (float) Size to which the extracted crop is resized (always square). If None, no resizing is done.
+
+ returns:
+ cv image - extracted crop
+ float - the factor by which the crop has been resized to make the crop size equal to output_sz
+ """
+
+ x, y, w, h = target_bb.tolist()
+
+ # Crop image
+ if scale_type == 'original':
+ crop_sz = math.ceil(math.sqrt(w * h) * search_area_factor)
+ elif scale_type == 'context':
+ # some context is added into the target_size
+ # now, the search factor is with respect to the "target + context"
+ # when search_factor = 1, output_size = 127
+ # when search_factor = 2, output_size = 255
+ context = (w + h) / 2
+ base_size = math.sqrt(
+ (w + context) * (h + context)) # corresponds to 127 in crop
+ crop_sz = math.ceil(search_area_factor * base_size)
+ else:
+ raise NotImplementedError
+
+ if crop_sz < 1:
+ raise Exception('Too small bounding box. w: {}, h: {}'.format(w, h))
+
+ x1 = round(x + 0.5 * w - crop_sz * 0.5)
+ x2 = x1 + crop_sz
+
+ y1 = round(y + 0.5 * h - crop_sz * 0.5)
+ y2 = y1 + crop_sz
+
+ x1_pad = max(0, -x1)
+ x2_pad = max(x2 - im.shape[1] + 1, 0)
+
+ y1_pad = max(0, -y1)
+ y2_pad = max(y2 - im.shape[0] + 1, 0)
+
+ # Crop target
+ im_crop = im[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad, :]
+
+ # Pad
+ if border_type == 'replicate':
+ im_crop_padded = cv.copyMakeBorder(im_crop, y1_pad, y2_pad, x1_pad,
+ x2_pad, cv.BORDER_REPLICATE)
+ elif border_type == 'zeropad':
+ im_crop_padded = cv.copyMakeBorder(im_crop, y1_pad, y2_pad, x1_pad,
+ x2_pad, cv.BORDER_CONSTANT)
+ elif border_type == 'meanpad':
+ avg_chans = np.array(
+ [np.mean(im[:, :, 0]), np.mean(im[:, :, 1]), np.mean(im[:, :, 2])])
+ im_crop_padded = cv.copyMakeBorder(
+ im_crop,
+ y1_pad,
+ y2_pad,
+ x1_pad,
+ x2_pad,
+ cv.BORDER_CONSTANT,
+ value=avg_chans)
+ else:
+ raise NotImplementedError
+
+ if output_sz is not None:
+ resize_factor = output_sz / crop_sz
+ return cv.resize(im_crop_padded, (output_sz, output_sz)), resize_factor
+ else:
+ return im_crop_padded, 1.0
+
+
+def transform_image_to_crop(box_in: np.ndarray,
+ box_extract: np.ndarray,
+ resize_factor: float,
+ crop_sz: np.ndarray) -> np.ndarray:
+ """ Transform the box co-ordinates from the original image co-ordinates to the co-ordinates of the cropped image
+ args:
+ box_in - the box for which the co-ordinates are to be transformed
+ box_extract - the box about which the image crop has been extracted.
+ resize_factor - the ratio between the original image scale and the scale of the image crop
+ crop_sz - size of the cropped image
+
+ returns:
+ array - transformed co-ordinates of box_in
+ """
+ box_extract_center = box_extract[0:2] + 0.5 * box_extract[2:4]
+
+ box_in_center = box_in[0:2] + 0.5 * box_in[2:4]
+
+ box_out_center = (crop_sz - 1) / 2 + (box_in_center - box_extract_center
+ ) * resize_factor
+ box_out_wh = box_in[2:4] * resize_factor
+
+ box_out = np.concatenate((box_out_center - 0.5 * box_out_wh, box_out_wh))
+ return box_out
+
+
+def centered_crop(frames, anno, area_factor, output_sz):
+ crops_resize_factors = [
+ sample_target(f, a, area_factor, output_sz)
+ for f, a in zip(frames, anno)
+ ]
+
+ frames_crop, resize_factors = zip(*crops_resize_factors)
+
+ crop_sz = np.array([output_sz, output_sz], 'int')
+
+ # find the bb location in the crop
+ anno_crop = [
+ transform_image_to_crop(a, a, rf, crop_sz)
+ for a, rf in zip(anno, resize_factors)
+ ]
+
+ return frames_crop, anno_crop
+
+
+def jittered_center_crop(frames,
+ box_extract,
+ box_gt,
+ search_area_factor,
+ output_sz,
+ scale_type='original',
+ border_type='replicate'):
+ """ For each frame in frames, extracts a square crop centered at box_extract, of area search_area_factor^2
+ times box_extract area. The extracted crops are then resized to output_sz. Further, the co-ordinates of the box
+ box_gt are transformed to the image crop co-ordinates
+
+ args:
+ frames - list of frames
+ box_extract - list of boxes of same length as frames. The crops are extracted using box_extract
+ box_gt - list of boxes of same length as frames. The co-ordinates of these boxes are transformed from
+ image co-ordinates to the crop co-ordinates
+ search_area_factor - The area of the extracted crop is search_area_factor^2 times box_extract area
+ output_sz - The size to which the extracted crops are resized
+
+ returns:
+ list - list of image crops
+ list - box_gt location in the crop co-ordinates
+ """
+ crops_resize_factors = [
+ sample_target(
+ f,
+ a,
+ search_area_factor,
+ output_sz,
+ scale_type=scale_type,
+ border_type=border_type) for f, a in zip(frames, box_extract)
+ ]
+
+ frames_crop, resize_factors = zip(*crops_resize_factors)
+
+ crop_sz = np.array([output_sz, output_sz], 'int')
+
+ # find the bb location in the crop
+ box_crop = [
+ transform_image_to_crop(a_gt, a_ex, rf, crop_sz)
+ for a_gt, a_ex, rf in zip(box_gt, box_extract, resize_factors)
+ ]
+
+ return frames_crop, box_crop
+
+
+def iou(reference, proposals):
+ """Compute the IoU between a reference box with multiple proposal boxes.
+
+ args:
+ reference - array of shape (1, 4).
+ proposals - array of shape (num_proposals, 4)
+
+ returns:
+ array - shape (num_proposals,) containing IoU of reference box with each proposal box.
+ """
+
+ # Intersection box
+ tl = np.maximum(reference[:, :2], proposals[:, :2])
+ br = np.minimum(reference[:, :2] + reference[:, 2:],
+ proposals[:, :2] + proposals[:, 2:])
+ sz = np.clip(br - tl, 0, np.inf)
+
+ # Area
+ intersection = np.prod(sz, axis=1)
+ union = np.prod(
+ reference[:, 2:], axis=1) + np.prod(
+ proposals[:, 2:], axis=1) - intersection
+
+ return intersection / union
+
+
+def rand_uniform(a, b, rng=None, shape=1):
+ """ sample numbers uniformly between a and b.
+ args:
+ a - lower bound
+ b - upper bound
+ shape - shape of the output tensor
+
+ returns:
+ array
+ """
+ rand = np.random.rand if rng is None else rng.rand
+ return (b - a) * rand(shape) + a
+
+
+def perturb_box(box, min_iou=0.5, sigma_factor=0.1, rng=None):
+ """ Perturb the input box by adding gaussian noise to the co-ordinates
+
+ args:
+ box - input box
+ min_iou - minimum IoU overlap between input box and the perturbed box
+ sigma_factor - amount of perturbation, relative to the box size. Can be either a single element, or a list of
+ sigma_factors, in which case one of them is sampled uniformly. Further, each sigma_factor
+ element can be either a float, or an array of shape (4,) specifying the sigma_factor
+ per co-ordinate
+
+ returns:
+ array - the perturbed box
+ """
+ if rng is None:
+ rng = np.random
+
+ if isinstance(sigma_factor, list):
+ # If list, sample one sigma_factor as current sigma factor
+ c_sigma_factor = rng.choice(sigma_factor)
+ else:
+ c_sigma_factor = sigma_factor
+
+ if not isinstance(c_sigma_factor, np.ndarray):
+ c_sigma_factor = c_sigma_factor * np.ones(4)
+
+ perturb_factor = np.sqrt(box[2] * box[3]) * c_sigma_factor
+
+ # multiple tries to ensure that the perturbed box has iou > min_iou with the input box
+ for i_ in range(100):
+ c_x = box[0] + 0.5 * box[2]
+ c_y = box[1] + 0.5 * box[3]
+ c_x_per = rng.normal(c_x, perturb_factor[0])
+ c_y_per = rng.normal(c_y, perturb_factor[1])
+
+ w_per = rng.normal(box[2], perturb_factor[2])
+ h_per = rng.normal(box[3], perturb_factor[3])
+
+ if w_per <= 1:
+ w_per = box[2] * rand_uniform(0.15, 0.5, rng)[0]
+
+ if h_per <= 1:
+ h_per = box[3] * rand_uniform(0.15, 0.5, rng)[0]
+
+ box_per = np.round(
+ np.array(
+ [c_x_per - 0.5 * w_per, c_y_per - 0.5 * h_per, w_per, h_per]))
+
+ if box_per[2] <= 1:
+ box_per[2] = box[2] * rand_uniform(0.15, 0.5, rng)
+
+ if box_per[3] <= 1:
+ box_per[3] = box[3] * rand_uniform(0.15, 0.5, rng)
+
+ box_iou = iou(np.reshape(box, (1, 4)), np.reshape(box_per, (1, 4)))
+
+ # if there is sufficient overlap, return
+ if box_iou > min_iou:
+ return box_per, box_iou
+
+ # else reduce the perturb factor
+ perturb_factor *= 0.9
+
+ return box_per, box_iou
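+
+
+if __name__ == '__main__':
+    # Hedged usage sketch on synthetic data: crop a 4x search region around a
+    # box in a dummy image, then generate one jittered proposal for that box.
+    rng = np.random.RandomState(0)
+    im = np.zeros((480, 640, 3), np.uint8)
+    box = np.array([300., 200., 60., 40.])  # [x, y, w, h]
+    crop, factor = sample_target(im, box, search_area_factor=4.0, output_sz=255)
+    print(crop.shape, factor)               # (255, 255, 3), output_sz / crop_sz
+    proposal, overlap = perturb_box(box, min_iou=0.5, sigma_factor=0.1, rng=rng)
+    print(proposal, overlap)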
diff --git a/PaddleCV/tracking/ltr/data/sampler.py b/PaddleCV/tracking/ltr/data/sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..064c604dfe14c71cdba21aea79d73af45d38f317
--- /dev/null
+++ b/PaddleCV/tracking/ltr/data/sampler.py
@@ -0,0 +1,180 @@
+import numpy as np
+import dataflow as df
+from pytracking.libs import TensorDict
+
+
+def no_processing(data, rng=None):
+ return data
+
+
+class ATOMSampler(df.RNGDataFlow):
+ """ Class responsible for sampling frames from training sequences to form batches. Each training sample is a
+ tuple consisting of i) a train frame, used to obtain the modulation vector, and ii) a set of test frames on which
+ the IoU prediction loss is calculated.
+
+ The sampling is done in the following ways. First a dataset is selected at random. Next, a sequence is selected
+ from that dataset. A 'train frame' is then sampled randomly from the sequence. Next, depending on the
+ frame_sample_mode, the required number of test frames are sampled randomly, either from the range
+ [train_frame_id - max_gap, train_frame_id + max_gap] in the 'default' mode, or from [train_frame_id, train_frame_id + max_gap]
+ in the 'causal' mode. Only the frames in which the target is visible are sampled, and if enough visible frames are
+ not found, the 'max_gap' is incremented.
+
+ The sampled frames are then passed through the input 'processing' function for the necessary processing.
+ """
+
+ def __init__(self,
+ datasets,
+ p_datasets,
+ samples_per_epoch,
+ max_gap,
+ num_test_frames=1,
+ processing=no_processing,
+ frame_sample_mode='default'):
+ """
+ args:
+ datasets - List of datasets to be used for training
+ p_datasets - List containing the probabilities by which each dataset will be sampled
+ samples_per_epoch - Number of training samples per epoch
+ max_gap - Maximum gap, in frame numbers, between the train (reference) frame and the test frames.
+ num_test_frames - Number of test frames used for calculating the IoU prediction loss.
+ processing - An instance of Processing class which performs the necessary processing of the data.
+ frame_sample_mode - Either 'default' or 'causal'. If 'causal', then the test frames are sampled in a causal
+ manner.
+ """
+ self.datasets = datasets
+
+ # If p not provided, sample uniformly from all videos
+ if p_datasets is None:
+ p_datasets = [1 for d in self.datasets]
+
+ # Normalize
+ p_total = sum(p_datasets)
+ self.p_datasets = [x / p_total for x in p_datasets]
+
+ self.samples_per_epoch = samples_per_epoch
+ self.max_gap = max_gap
+ self.num_test_frames = num_test_frames
+ self.num_train_frames = 1 # Only a single train frame allowed
+ self.processing = processing
+ self.frame_sample_mode = frame_sample_mode
+
+ def __len__(self):
+ return self.samples_per_epoch
+
+ def _sample_visible_ids(self, visible, num_ids=1, min_id=None, max_id=None):
+ """ Samples num_ids frames between min_id and max_id for which target is visible
+
+ args:
+ visible - 1d Tensor indicating whether target is visible for each frame
+ num_ids - number of frames to be sampled
+ min_id - Minimum allowed frame number
+ max_id - Maximum allowed frame number
+
+ returns:
+ list - List of sampled frame numbers. None if not sufficient visible frames could be found.
+ """
+ if min_id is None or min_id < 0:
+ min_id = 0
+ if max_id is None or max_id > len(visible):
+ max_id = len(visible)
+
+ valid_ids = [i for i in range(min_id, max_id) if visible[i]]
+
+ # No visible ids
+ if len(valid_ids) == 0:
+ return None
+
+ inds = self.rng.choice(
+ range(len(valid_ids)), size=num_ids, replace=True)
+ ids = [valid_ids[ii] for ii in inds]
+ # return random.choices(valid_ids, k=num_ids)
+ return ids
+
+ def __iter__(self):
+ """
+ Randomly selects a dataset and a sequence, samples train/test frame ids, and
+ passes the result through the processing function.
+
+ yields:
+ TensorDict - dict containing all the data blocks
+ """
+
+ # Select a dataset
+ # dataset = self.rng.choices(self.datasets, self.p_datasets)[0]
+ dataset_idx = self.rng.choice(
+ range(len(self.datasets)), p=self.p_datasets, replace=False)
+ dataset = self.datasets[dataset_idx]
+ is_video_dataset = dataset.is_video_sequence()
+
+ min_visible_frames = 2 * (self.num_test_frames + self.num_train_frames)
+ enough_visible_frames = False
+
+ # Sample a sequence with enough visible frames and get anno for the same
+ while not enough_visible_frames:
+ seq_id = self.rng.randint(0, dataset.get_num_sequences() - 1)
+ anno, visible = dataset.get_sequence_info(seq_id)
+ num_visible = np.sum(visible.astype('int64'))
+ enough_visible_frames = not is_video_dataset or (
+ num_visible > min_visible_frames and len(visible) >= 20)
+
+ if is_video_dataset:
+ train_frame_ids = None
+ test_frame_ids = None
+ gap_increase = 0
+ if self.frame_sample_mode == 'default':
+ # Sample frame numbers
+ while test_frame_ids is None:
+ train_frame_ids = self._sample_visible_ids(
+ visible, num_ids=self.num_train_frames)
+ test_frame_ids = self._sample_visible_ids(
+ visible,
+ min_id=train_frame_ids[0] - self.max_gap - gap_increase,
+ max_id=train_frame_ids[0] + self.max_gap + gap_increase,
+ num_ids=self.num_test_frames)
+ gap_increase += 5 # Increase gap until a frame is found
+ elif self.frame_sample_mode == 'causal':
+ # Sample frame numbers in a causal manner, i.e. test_frame_ids > train_frame_ids
+ while test_frame_ids is None:
+ base_frame_id = self._sample_visible_ids(
+ visible,
+ num_ids=1,
+ min_id=self.num_train_frames - 1,
+ max_id=len(visible) - self.num_test_frames)
+ prev_frame_ids = self._sample_visible_ids(
+ visible,
+ num_ids=self.num_train_frames - 1,
+ min_id=base_frame_id[0] - self.max_gap - gap_increase,
+ max_id=base_frame_id[0])
+ if prev_frame_ids is None:
+ gap_increase += 5
+ continue
+ train_frame_ids = base_frame_id + prev_frame_ids
+ test_frame_ids = self._sample_visible_ids(
+ visible,
+ min_id=train_frame_ids[0] + 1,
+ max_id=train_frame_ids[0] + self.max_gap + gap_increase,
+ num_ids=self.num_test_frames)
+ gap_increase += 5 # Increase gap until a frame is found
+ else:
+ raise ValueError('Unknown frame_sample_mode.')
+ else:
+ train_frame_ids = [1] * self.num_train_frames
+ test_frame_ids = [1] * self.num_test_frames
+
+ # Get frames
+ train_frames, train_anno, _ = dataset.get_frames(seq_id,
+ train_frame_ids, anno)
+ test_frames, test_anno, _ = dataset.get_frames(seq_id, test_frame_ids,
+ anno)
+
+ # Prepare data
+ data = TensorDict({
+ 'train_images': train_frames,
+ 'train_anno': train_anno,
+ 'test_images': test_frames,
+ 'test_anno': test_anno,
+ 'dataset': dataset.get_name()
+ })
+
+ # Send for processing
+ yield self.processing(data, rng=self.rng)
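+
+
+# Hedged wiring sketch (comments only): a sampler is built from dataset objects
+# (see ltr/dataset) plus a processing instance, then wrapped in an LTRLoader.
+# The dataset choice, `atom_processing` and the numbers below are illustrative.
+#
+#     sampler = ATOMSampler([Got10k(split='train')], p_datasets=None,
+#                           samples_per_epoch=1000, max_gap=50,
+#                           num_test_frames=1, processing=atom_processing)
+#     loader = LTRLoader('train', sampler, batch_size=64, num_workers=4)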
diff --git a/PaddleCV/tracking/ltr/data/transforms.py b/PaddleCV/tracking/ltr/data/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..83c6e3611119a757612377bea6e252049f19f9fe
--- /dev/null
+++ b/PaddleCV/tracking/ltr/data/transforms.py
@@ -0,0 +1,148 @@
+import random
+import numpy as np
+import math
+import cv2 as cv
+from paddle.fluid import layers
+from pytracking.libs.paddle_utils import PTensor
+
+
+class Transform:
+ """ Class for applying various image transformations."""
+
+ def __call__(self, *args):
+ rand_params = self.roll()
+ if rand_params is None:
+ rand_params = ()
+ elif not isinstance(rand_params, tuple):
+ rand_params = (rand_params, )
+ output = [self.transform(img, *rand_params) for img in args]
+ if len(output) == 1:
+ return output[0]
+ return output
+
+ def roll(self):
+ return None
+
+ def transform(self, img, *args):
+ """Must be deterministic"""
+ raise NotImplementedError
+
+
+class Compose:
+ """Composes several transforms together.
+
+ Args:
+ transforms (list of ``Transform`` objects): list of transforms to compose.
+ """
+
+ def __init__(self, transforms):
+ self.transforms = transforms
+
+ def __call__(self, *args):
+ for t in self.transforms:
+ if not isinstance(args, tuple):
+ args = (args, )
+ args = t(*args)
+ return args
+
+ def __repr__(self):
+ format_string = self.__class__.__name__ + '('
+ for t in self.transforms:
+ format_string += '\n'
+ format_string += ' {0}'.format(t)
+ format_string += '\n)'
+ return format_string
+
+
+class Normalize(object):
+ """Normalize an tensor image with mean and standard deviation.
+ Given mean: ``(M1,...,Mn)`` and std: ``(S1,..,Sn)`` for ``n`` channels, this transform
+ will normalize each channel of the input i.e.
+ ``input[channel] = (input[channel] - mean[channel]) / std[channel]``
+
+ Args:
+ mean (sequence): Sequence of means for each channel.
+ std (sequence): Sequence of standard deviations for each channel.
+ """
+
+ def __init__(self, mean, std):
+ self.mean = np.reshape(mean, [-1, 1, 1])
+ self.std = np.reshape(std, [-1, 1, 1])
+
+ def __call__(self, tensor):
+ """
+ Args:
+ tensor (Tensor): Tensor image of size (C, H, W) to be normalized.
+
+ Returns:
+ Tensor: Normalized Tensor image.
+ """
+ return (tensor - self.mean) / self.std
+
+
+class ToArray(Transform):
+ """ Transpose image and jitter brightness"""
+
+ def __init__(self, brightness_jitter=0.0):
+ self.brightness_jitter = brightness_jitter
+
+ def __call__(self, img):
+ img = img.transpose((2, 0, 1))
+ return img.astype('float32') / 255.
+
+
+class ToArrayAndJitter(Transform):
+ """ Transpose image and jitter brightness"""
+
+ def __init__(self, brightness_jitter=0.0):
+ self.brightness_jitter = brightness_jitter
+
+ def roll(self):
+ return np.random.uniform(
+ max(0, 1 - self.brightness_jitter), 1 + self.brightness_jitter)
+
+ def transform(self, img, brightness_factor):
+ # handle numpy array
+ img = img.transpose((2, 0, 1))
+
+ # scale to [0, 1] and apply the brightness factor
+ return np.clip(
+ img.astype('float32') * brightness_factor / 255.0, 0.0, 1.0)
+
+
+class ToGrayscale(Transform):
+ """Converts image to grayscale with probability"""
+
+ def __init__(self, probability=0.5):
+ self.probability = probability
+ self.color_weights = np.array(
+ [0.2989, 0.5870, 0.1140], dtype=np.float32)
+
+ def roll(self):
+ return random.random() < self.probability
+
+ def transform(self, img, do_grayscale):
+ if do_grayscale:
+ if isinstance(img, PTensor):
+ raise NotImplementedError('Implement paddle variant.')
+ img_gray = cv.cvtColor(img, cv.COLOR_RGB2GRAY)
+ return np.stack([img_gray, img_gray, img_gray], axis=2)
+ # return np.repeat(np.sum(img * self.color_weights, axis=2, keepdims=True).astype(np.uint8), 3, axis=2)
+ return img
+
+
+class RandomHorizontalFlip(Transform):
+ """Horizontally flip the given NumPy Image randomly with a probability p."""
+
+ def __init__(self, probability=0.5):
+ self.probability = probability
+
+ def roll(self):
+ return random.random() < self.probability
+
+ def transform(self, img, do_flip):
+ if do_flip:
+ if isinstance(img, PTensor):
+ return layers.reverse(img, 2)
+ return np.fliplr(img).copy()
+ return img
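+
+
+if __name__ == '__main__':
+    # Hedged usage sketch on a random dummy image; the mean/std values are the
+    # common ImageNet statistics and are used here purely for illustration.
+    img = (np.random.rand(64, 64, 3) * 255).astype(np.uint8)
+    joint = Compose([ToGrayscale(probability=0.5)])
+    per_image = Compose([
+        ToArrayAndJitter(brightness_jitter=0.2),
+        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+    ])
+    out = per_image(joint(img))
+    print(out.shape)  # (3, 64, 64)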
diff --git a/PaddleCV/tracking/ltr/dataset/__init__.py b/PaddleCV/tracking/ltr/dataset/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..330cd163c602b0eecbbf67b0758959bc179a7442
--- /dev/null
+++ b/PaddleCV/tracking/ltr/dataset/__init__.py
@@ -0,0 +1,8 @@
+from .lasot import Lasot
+from .got10k import Got10k
+from .tracking_net import TrackingNet
+from .imagenetvid import ImagenetVID
+from .coco_seq import MSCOCOSeq
+from .vot import VOT
+from .youtube_vos import VOS
+from .youtube_bb import YoutubeBB
diff --git a/PaddleCV/tracking/ltr/dataset/base_dataset.py b/PaddleCV/tracking/ltr/dataset/base_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..34129284ae19b431548b91c9756099c689d2aead
--- /dev/null
+++ b/PaddleCV/tracking/ltr/dataset/base_dataset.py
@@ -0,0 +1,85 @@
+from ltr.data.image_loader import default_image_loader
+
+
+class BaseDataset(object):
+ """ Base class for datasets """
+
+ def __init__(self, root, image_loader=default_image_loader):
+ """
+ args:
+ root - The root path to the dataset
+ image_loader (jpeg4py_loader) - The function to read the images. jpeg4py (https://github.com/ajkxyz/jpeg4py)
+ is used by default.
+ """
+ if root == '':
+ raise Exception(
+ 'The dataset path is not set up. Check your "ltr/admin/local.py".'
+ )
+ self.root = root
+ self.image_loader = image_loader
+
+ self.sequence_list = [] # Contains the list of sequences.
+
+ def __len__(self):
+ """ Returns size of the dataset
+ returns:
+ int - number of samples in the dataset
+ """
+ return self.get_num_sequences()
+
+ def __getitem__(self, index):
+ """ Not to be used! Check get_frames() instead.
+ """
+ return None
+
+ def is_video_sequence(self):
+ """ Returns whether the dataset is a video dataset or an image dataset
+
+ returns:
+ bool - True if a video dataset
+ """
+ return True
+
+ def get_name(self):
+ """ Name of the dataset
+
+ returns:
+ string - Name of the dataset
+ """
+ raise NotImplementedError
+
+ def get_num_sequences(self):
+ """ Number of sequences in a dataset
+
+ returns:
+ int - number of sequences in the dataset."""
+ return len(self.sequence_list)
+
+ def get_sequence_info(self, seq_id):
+ """ Returns information about a particular sequences,
+
+ args:
+ seq_id - index of the sequence
+
+ returns:
+ Tensor - Annotation for the sequence. A 2d tensor of shape (num_frames, 4).
+ Format [top_left_x, top_left_y, width, height]
+ Tensor - 1d Tensor specifying whether the target is present (= 1) for each frame. Shape (num_frames,)
+ """
+ raise NotImplementedError
+
+ def get_frames(self, seq_id, frame_ids, anno=None):
+ """ Get a set of frames from a particular sequence
+
+ args:
+ seq_id - index of sequence
+ frame_ids - a list of frame numbers
+ anno(None) - The annotation for the sequence (see get_sequence_info). If None, they will be loaded.
+
+ returns:
+ list - List of frames corresponding to frame_ids
+ list - List of annotations (tensor of shape (4,)) for each frame
+ dict - A dict containing meta information about the sequence, e.g. class of the target object.
+
+ """
+ raise NotImplementedError
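+
+
+if __name__ == '__main__':
+    # Hedged sketch of the minimal interface a new dataset subclass must provide.
+    # The single synthetic sequence and the '/tmp' root are purely illustrative.
+    import numpy as np
+
+    class _ToyDataset(BaseDataset):
+        def __init__(self):
+            super().__init__(root='/tmp')
+            self.sequence_list = ['seq_0']
+
+        def get_name(self):
+            return 'toy'
+
+        def get_sequence_info(self, seq_id):
+            anno = np.tile([[10., 10., 50., 50.]], (30, 1))  # (num_frames, 4)
+            return anno, np.ones(30, 'bool')
+
+        def get_frames(self, seq_id, frame_ids, anno=None):
+            frames = [np.zeros((120, 160, 3), np.uint8) for _ in frame_ids]
+            boxes = [np.array([10., 10., 50., 50.]) for _ in frame_ids]
+            return frames, boxes, {'object_class': None}
+
+    ds = _ToyDataset()
+    print(len(ds), ds.get_name())  # 1 toy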
diff --git a/PaddleCV/tracking/ltr/dataset/coco_seq.py b/PaddleCV/tracking/ltr/dataset/coco_seq.py
new file mode 100644
index 0000000000000000000000000000000000000000..d55442944dfda2417c50c10fc35c0237b8cc22c8
--- /dev/null
+++ b/PaddleCV/tracking/ltr/dataset/coco_seq.py
@@ -0,0 +1,130 @@
+import os
+from .base_dataset import BaseDataset
+from ltr.data.image_loader import default_image_loader
+from pycocotools.coco import COCO
+from collections import OrderedDict
+from ltr.admin.environment import env_settings
+import numpy as np
+
+
+class MSCOCOSeq(BaseDataset):
+ """ The COCO dataset. COCO is an image dataset. Thus, we treat each image as a sequence of length 1.
+
+ Publication:
+ Microsoft COCO: Common Objects in Context.
+ Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B. Girshick, James Hays, Pietro Perona,
+ Deva Ramanan, Piotr Dollar and C. Lawrence Zitnick
+ ECCV, 2014
+ https://arxiv.org/pdf/1405.0312.pdf
+
+ Download the images along with annotations from http://cocodataset.org/#download. The root folder should be
+ organized as follows.
+ - coco_root
+ - annotations
+ - instances_train2014.json
+ - images
+ - train2014
+
+ Note: You also have to install the COCO Python API from https://github.com/cocodataset/cocoapi.
+ """
+
+ def __init__(self,
+ root=None,
+ filter=None,
+ image_loader=default_image_loader):
+ root = env_settings().coco_dir if root is None else root
+ super().__init__(root, image_loader)
+ self.filter = filter
+
+ # self.img_pth = os.path.join(root, 'train2014/')
+ self.img_pth = os.path.join(root, 'train2017/')
+ # self.anno_path = os.path.join(root, 'annotations/instances_train2014.json')
+ self.anno_path = os.path.join(root,
+ 'annotations/instances_train2017.json')
+
+ # Load the COCO set.
+ self.coco_set = COCO(self.anno_path)
+
+ self.cats = self.coco_set.cats
+ self.sequence_list = self._get_sequence_list()
+
+ def _get_sequence_list(self):
+ ann_list = list(self.coco_set.anns.keys())
+ seq_list = []
+ print('COCO before: {}'.format(len(ann_list)))
+ for a in ann_list:
+ if self.coco_set.anns[a]['iscrowd'] == 0:
+ box = self.coco_set.anns[a]['bbox']
+ box = np.reshape(np.array(box), (1, 4))
+ target_visible = (box[:, 2] > 0) & (box[:, 3] > 0)
+ if self.filter:
+ target_large = (box[:, 2] * box[:, 3] > 30 * 30)
+ ratio = box[:, 2] / box[:, 3]
+ target_reasonable_ratio = (10 > ratio) & (ratio > 0.1)
+ target_visible = target_visible & target_large & target_reasonable_ratio
+ if target_visible:
+ seq_list.append(a)
+ print('COCO after: {}'.format(len(seq_list)))
+ return seq_list
+
+ def is_video_sequence(self):
+ return False
+
+ def get_name(self):
+ return 'coco'
+
+ def get_num_sequences(self):
+ return len(self.sequence_list)
+
+ def get_sequence_info(self, seq_id):
+ anno = self._get_anno(seq_id)
+ target_visible = (anno[:, 2] > 0) & (anno[:, 3] > 0)
+ return anno, target_visible
+
+ def _get_anno(self, seq_id):
+ anno = self.coco_set.anns[self.sequence_list[seq_id]]['bbox']
+ return np.reshape(np.array(anno), (1, 4))
+
+ def _get_frames(self, seq_id):
+ path = self.coco_set.loadImgs(
+ [self.coco_set.anns[self.sequence_list[seq_id]]['image_id']])[0][
+ 'file_name']
+ img = self.image_loader(os.path.join(self.img_pth, path))
+ return img
+
+ def get_meta_info(self, seq_id):
+ try:
+ cat_dict_current = self.cats[self.coco_set.anns[self.sequence_list[
+ seq_id]]['category_id']]
+ object_meta = OrderedDict({
+ 'object_class': cat_dict_current['name'],
+ 'motion_class': None,
+ 'major_class': cat_dict_current['supercategory'],
+ 'root_class': None,
+ 'motion_adverb': None
+ })
+ except:
+ object_meta = OrderedDict({
+ 'object_class': None,
+ 'motion_class': None,
+ 'major_class': None,
+ 'root_class': None,
+ 'motion_adverb': None
+ })
+ return object_meta
+
+ def get_frames(self, seq_id=None, frame_ids=None, anno=None):
+ # COCO is an image dataset. Thus we replicate the image denoted by seq_id len(frame_ids) times, and return a
+ # list containing these replicated images.
+ frame = self._get_frames(seq_id)
+
+ frame_list = [frame.copy() for _ in frame_ids]
+
+ if anno is None:
+ anno = self._get_anno(seq_id)
+
+ anno_frames = [anno.copy()[0, :] for _ in frame_ids]
+
+ object_meta = self.get_meta_info(seq_id)
+
+ return frame_list, anno_frames, object_meta
diff --git a/PaddleCV/tracking/ltr/dataset/got10k.py b/PaddleCV/tracking/ltr/dataset/got10k.py
new file mode 100644
index 0000000000000000000000000000000000000000..986e2cf2f8098484eae1afaa2ea2574d41b9bc98
--- /dev/null
+++ b/PaddleCV/tracking/ltr/dataset/got10k.py
@@ -0,0 +1,183 @@
+import os
+import os.path
+import numpy as np
+import csv
+import pandas
+from collections import OrderedDict
+from .base_dataset import BaseDataset
+from ltr.data.image_loader import default_image_loader
+from ltr.admin.environment import env_settings
+
+
+class Got10k(BaseDataset):
+ """ GOT-10k dataset.
+
+ Publication:
+ GOT-10k: A Large High-Diversity Benchmark for Generic Object Tracking in the Wild
+ Lianghua Huang, Xin Zhao, and Kaiqi Huang
+ arXiv:1810.11981, 2018
+ https://arxiv.org/pdf/1810.11981.pdf
+
+ Download dataset from http://got-10k.aitestunion.com/downloads
+ """
+
+ def __init__(self,
+ root=None,
+ filter=None,
+ image_loader=default_image_loader,
+ split=None,
+ seq_ids=None):
+ """
+ args:
+ root - path to the got-10k training data. Note: This should point to the 'train' folder inside GOT-10k
+ image_loader (jpeg4py_loader) - The function to read the images. jpeg4py (https://github.com/ajkxyz/jpeg4py)
+ is used by default.
+ split - 'train' or 'val'. Note: The validation split here is a subset of the official got-10k train split,
+ not the official got-10k validation split. To use the official validation split, provide that as
+ the root folder instead.
+ seq_ids - List containing the ids of the videos to be used for training. Note: Only one of 'split' or 'seq_ids'
+ options can be used at the same time.
+ """
+ root = env_settings().got10k_dir if root is None else root
+ super().__init__(root, image_loader)
+
+ # all folders inside the root
+ self.sequence_list = self._get_sequence_list()
+
+ if split == 'vot-train':
+ ltr_path = os.path.join(
+ os.path.dirname(os.path.realpath(__file__)), '..')
+ with open(
+ os.path.join(ltr_path, 'data_specs',
+ 'got10k_prohibited_for_VOT.txt')) as f:
+ prohibited = [l.strip() for l in f.readlines()]
+ print('GOT10K before: {}'.format(len(self.sequence_list)))
+ self.sequence_list = [
+ x for x in self.sequence_list if x not in prohibited
+ ]
+ print('GOT10K after: {}'.format(len(self.sequence_list)))
+ else:
+ # seq_id is the index of the folder inside the got10k root path
+ if split is not None:
+ if seq_ids is not None:
+ raise ValueError('Cannot set both split_name and seq_ids.')
+ ltr_path = os.path.join(
+ os.path.dirname(os.path.realpath(__file__)), '..')
+ if split == 'train':
+ file_path = os.path.join(ltr_path, 'data_specs',
+ 'got10k_train_split.txt')
+ elif split == 'val':
+ file_path = os.path.join(ltr_path, 'data_specs',
+ 'got10k_val_split.txt')
+ else:
+ raise ValueError('Unknown split name.')
+ seq_ids = pandas.read_csv(
+ file_path, header=None, squeeze=True,
+ dtype=np.int64).values.tolist()
+ elif seq_ids is None:
+ seq_ids = list(range(0, len(self.sequence_list)))
+ # self.seq_ids = seq_ids
+
+ self.sequence_list = [self.sequence_list[i] for i in seq_ids]
+
+ self.sequence_meta_info = self._load_meta_info()
+ self.filter = filter
+
+ def get_name(self):
+ return 'got10k'
+
+ def _load_meta_info(self):
+ sequence_meta_info = {
+ s: self._read_meta(os.path.join(self.root, s))
+ for s in self.sequence_list
+ }
+ return sequence_meta_info
+
+ def _read_meta(self, seq_path):
+ try:
+ with open(os.path.join(seq_path, 'meta_info.ini')) as f:
+ meta_info = f.readlines()
+ object_meta = OrderedDict({
+ 'object_class': meta_info[5].split(': ')[-1][:-1],
+ 'motion_class': meta_info[6].split(': ')[-1][:-1],
+ 'major_class': meta_info[7].split(': ')[-1][:-1],
+ 'root_class': meta_info[8].split(': ')[-1][:-1],
+ 'motion_adverb': meta_info[9].split(': ')[-1][:-1]
+ })
+ except:
+ object_meta = OrderedDict({
+ 'object_class': None,
+ 'motion_class': None,
+ 'major_class': None,
+ 'root_class': None,
+ 'motion_adverb': None
+ })
+ return object_meta
+
+ def _get_sequence_list(self):
+ with open(os.path.join(self.root, 'list.txt')) as f:
+ # dir_names = f.readlines()
+ dir_list = list(csv.reader(f))
+ dir_list = [dir_name[0] for dir_name in dir_list]
+ return dir_list
+
+ def _read_anno(self, seq_path):
+ anno_file = os.path.join(seq_path, "groundtruth.txt")
+ gt = pandas.read_csv(
+ anno_file,
+ delimiter=',',
+ header=None,
+ dtype=np.float32,
+ na_filter=False,
+ low_memory=False).values
+ return np.array(gt)
+
+ def _read_target_visible(self, seq_path, anno):
+ # Read the absence (full occlusion) and cover labels
+ occlusion_file = os.path.join(seq_path, "absence.label")
+ cover_file = os.path.join(seq_path, "cover.label")
+
+ with open(occlusion_file, 'r', newline='') as f:
+ occlusion = np.array([int(v[0]) for v in csv.reader(f)], 'byte')
+ with open(cover_file, 'r', newline='') as f:
+ cover = np.array([int(v[0]) for v in csv.reader(f)], 'byte')
+
+ target_visible = ~occlusion & (cover > 0) & (anno[:, 2] > 0) & (
+ anno[:, 3] > 0)
+
+ return target_visible
+
+ def _get_sequence_path(self, seq_id):
+ return os.path.join(self.root, self.sequence_list[seq_id])
+
+ def get_sequence_info(self, seq_id):
+ seq_path = self._get_sequence_path(seq_id)
+ anno = self._read_anno(seq_path)
+ target_visible = self._read_target_visible(seq_path, anno)
+ if self.filter:
+ target_large = (anno[:, 2] * anno[:, 3] > 30 * 30)
+ ratio = anno[:, 2] / anno[:, 3]
+ target_reasonable_ratio = (10 > ratio) & (ratio > 0.1)
+ target_visible = target_visible & target_large & target_reasonable_ratio
+ return anno, target_visible
+
+ def _get_frame_path(self, seq_path, frame_id):
+ return os.path.join(
+ seq_path, '{:08}.jpg'.format(frame_id + 1)) # frames start from 1
+
+ def _get_frame(self, seq_path, frame_id):
+ return self.image_loader(self._get_frame_path(seq_path, frame_id))
+
+ def get_frames(self, seq_id, frame_ids, anno=None):
+ seq_path = self._get_sequence_path(seq_id)
+ obj_meta = self.sequence_meta_info[self.sequence_list[seq_id]]
+
+ frame_list = [self._get_frame(seq_path, f_id) for f_id in frame_ids]
+
+ if anno is None:
+ anno = self._read_anno(seq_path)
+
+ # Return as list of tensors
+ anno_frames = [anno[f_id, :] for f_id in frame_ids]
+
+ return frame_list, anno_frames, obj_meta
diff --git a/PaddleCV/tracking/ltr/dataset/imagenetvid.py b/PaddleCV/tracking/ltr/dataset/imagenetvid.py
new file mode 100644
index 0000000000000000000000000000000000000000..15b5ae0a70996a22944ea68946b81e492789ccd2
--- /dev/null
+++ b/PaddleCV/tracking/ltr/dataset/imagenetvid.py
@@ -0,0 +1,201 @@
+import os
+import numpy as np
+from .base_dataset import BaseDataset
+from ltr.data.image_loader import default_image_loader
+import xml.etree.ElementTree as ET
+import json
+from collections import OrderedDict
+import nltk
+from nltk.corpus import wordnet
+from ltr.admin.environment import env_settings
+
+
+def get_target_to_image_ratio(seq):
+ anno = np.array(seq['anno'])
+ img_sz = np.array(seq['image_size'])
+ return np.sqrt(anno[0, 2:4].prod() / (img_sz.prod()))
+
+
+class ImagenetVID(BaseDataset):
+ """ Imagenet VID dataset.
+
+ Publication:
+ ImageNet Large Scale Visual Recognition Challenge
+ Olga Russakovsky, Jia Deng, Hao Su, Jonathan Krause, Sanjeev Satheesh, Sean Ma, Zhiheng Huang, Andrej Karpathy,
+ Aditya Khosla, Michael Bernstein, Alexander C. Berg and Li Fei-Fei
+ IJCV, 2015
+ https://arxiv.org/pdf/1409.0575.pdf
+
+ Download the dataset from http://image-net.org/
+ """
+
+ def __init__(self,
+ root=None,
+ filter=None,
+ image_loader=default_image_loader,
+ min_length=0,
+ max_target_area=1):
+ """
+ args:
+ root - path to the imagenet vid dataset.
+ image_loader (jpeg4py_loader) - The function to read the images. jpeg4py (https://github.com/ajkxyz/jpeg4py)
+ is used by default.
+ min_length - Minimum allowed sequence length.
+ max_target_area - max allowed ratio between target area and image area. Can be used to filter out targets
+ which cover complete image.
+ """
+ root = env_settings().imagenet_dir if root is None else root
+ super().__init__(root, image_loader)
+
+ cache_file = os.path.join(root, 'cache.json')
+ if os.path.isfile(cache_file):
+ # If available, load the pre-processed cache file containing meta-info for each sequence
+ with open(cache_file, 'r') as f:
+ sequence_list_dict = json.load(f)
+
+ self.sequence_list = sequence_list_dict
+ else:
+ # Else process the imagenet annotations and generate the cache file
+ self.sequence_list = self._process_anno(root)
+
+ with open(cache_file, 'w') as f:
+ json.dump(self.sequence_list, f)
+
+ # Filter the sequences based on min_length and max_target_area in the first frame
+ self.sequence_list = [
+ x for x in self.sequence_list
+ if len(x['anno']) >= min_length and get_target_to_image_ratio(x) <
+ max_target_area
+ ]
+ self.filter = filter
+
+ def get_name(self):
+ return 'imagenetvid'
+
+ def get_num_sequences(self):
+ return len(self.sequence_list)
+
+ def get_sequence_info(self, seq_id):
+ anno = np.array(self.sequence_list[seq_id]['anno'])
+ target_visible = np.array(self.sequence_list[seq_id]['target_visible'],
+ 'bool')
+ target_visible = target_visible & (anno[:, 2] > 0) & (anno[:, 3] > 0)
+ if self.filter is not None:
+ target_large = (anno[:, 2] * anno[:, 3] > 30 * 30)
+ ratio = anno[:, 2] / anno[:, 3]
+ target_reasonable_ratio = (10 > ratio) & (ratio > 0.1)
+ target_visible = target_visible & target_reasonable_ratio & target_large
+ return anno, target_visible
+
+ def _get_frame(self, sequence, frame_id):
+ set_name = 'ILSVRC2015_VID_train_{:04d}'.format(sequence['set_id'])
+ vid_name = 'ILSVRC2015_train_{:08d}'.format(sequence['vid_id'])
+ frame_number = frame_id + sequence['start_frame']
+
+ frame_path = os.path.join(self.root, 'Data', 'VID', 'train', set_name,
+ vid_name, '{:06d}.JPEG'.format(frame_number))
+ # frame_path = os.path.join(self.root, 'Data', 'VID', 'train', vid_name,
+ # '{:06d}.jpg'.format(frame_number))
+ return self.image_loader(frame_path)
+
+ def get_frames(self, seq_id, frame_ids, anno=None):
+ sequence = self.sequence_list[seq_id]
+
+ frame_list = [self._get_frame(sequence, f) for f in frame_ids]
+
+ if anno is None:
+ anno = sequence['anno']
+
+ # Return as list of tensors
+ anno_frames = [anno[f_id, :] for f_id in frame_ids]
+
+ # added the class info to the meta info
+ object_meta = OrderedDict({
+ 'object_class': sequence['class_name'],
+ 'motion_class': None,
+ 'major_class': None,
+ 'root_class': None,
+ 'motion_adverb': None
+ })
+
+ return frame_list, anno_frames, object_meta
+
+ def _process_anno(self, root):
+ # Builds individual tracklets
+ base_vid_anno_path = os.path.join(root, 'Annotations', 'VID', 'train')
+
+ all_sequences = []
+ # for set in sorted(os.listdir(base_vid_anno_path)):
+ for set in sorted([
+ 'ILSVRC2015_VID_train_0000', 'ILSVRC2015_VID_train_0001',
+ 'ILSVRC2015_VID_train_0002', 'ILSVRC2015_VID_train_0003'
+ ]):
+ set_id = int(set.split('_')[-1])
+ for vid in sorted(
+ os.listdir(os.path.join(base_vid_anno_path, set))):
+
+ vid_id = int(vid.split('_')[-1])
+ anno_files = sorted(
+ os.listdir(os.path.join(base_vid_anno_path, set, vid)))
+
+ frame1_anno = ET.parse(
+ os.path.join(base_vid_anno_path, set, vid, anno_files[0]))
+ image_size = [
+ int(frame1_anno.find('size/width').text),
+ int(frame1_anno.find('size/height').text)
+ ]
+
+ objects = [
+ ET.ElementTree(file=os.path.join(base_vid_anno_path, set,
+ vid, f)).findall('object')
+ for f in anno_files
+ ]
+
+ tracklets = {}
+
+ # Find all tracklets along with start frame
+ for f_id, all_targets in enumerate(objects):
+ for target in all_targets:
+ tracklet_id = target.find('trackid').text
+ if tracklet_id not in tracklets:
+ tracklets[tracklet_id] = f_id
+
+ for tracklet_id, tracklet_start in tracklets.items():
+ tracklet_anno = []
+ target_visible = []
+ class_name = None
+
+ for f_id in range(tracklet_start, len(objects)):
+ found = False
+ for target in objects[f_id]:
+ if target.find('trackid').text == tracklet_id:
+ if not class_name:
+ class_name_id = target.find('name').text
+ class_name = class_name_id
+ # class_name = self._get_class_name_from_id(class_name_id)
+ x1 = int(target.find('bndbox/xmin').text)
+ y1 = int(target.find('bndbox/ymin').text)
+ x2 = int(target.find('bndbox/xmax').text)
+ y2 = int(target.find('bndbox/ymax').text)
+
+ tracklet_anno.append([x1, y1, x2 - x1, y2 - y1])
+ target_visible.append(
+ target.find('occluded').text == '0')
+
+ found = True
+ break
+ if not found:
+ break
+
+ new_sequence = {
+ 'set_id': set_id,
+ 'vid_id': vid_id,
+ 'class_name': class_name,
+ 'start_frame': tracklet_start,
+ 'anno': tracklet_anno,
+ 'target_visible': target_visible,
+ 'image_size': image_size
+ }
+ all_sequences.append(new_sequence)
+
+ return all_sequences
diff --git a/PaddleCV/tracking/ltr/dataset/lasot.py b/PaddleCV/tracking/ltr/dataset/lasot.py
new file mode 100644
index 0000000000000000000000000000000000000000..11c5a3f173fed4b38dc610cd5c936c65ef532a1a
--- /dev/null
+++ b/PaddleCV/tracking/ltr/dataset/lasot.py
@@ -0,0 +1,152 @@
+import os
+import os.path
+import numpy as np
+import pandas
+import csv
+from collections import OrderedDict
+from .base_dataset import BaseDataset
+from ltr.data.image_loader import default_image_loader
+from ltr.admin.environment import env_settings
+
+
+class Lasot(BaseDataset):
+ """ LaSOT dataset.
+
+ Publication:
+ LaSOT: A High-quality Benchmark for Large-scale Single Object Tracking
+ Heng Fan, Liting Lin, Fan Yang, Peng Chu, Ge Deng, Sijia Yu, Hexin Bai, Yong Xu, Chunyuan Liao and Haibin Ling
+ CVPR, 2019
+ https://arxiv.org/pdf/1809.07845.pdf
+
+ Download the dataset from https://cis.temple.edu/lasot/download.html
+ """
+
+ def __init__(self,
+ root=None,
+ filter=None,
+ image_loader=default_image_loader,
+ vid_ids=None,
+ split=None):
+ """
+ args:
+ root - path to the lasot dataset.
+ image_loader (jpeg4py_loader) - The function to read the images. jpeg4py (https://github.com/ajkxyz/jpeg4py)
+ is used by default.
+ vid_ids - List containing the ids of the videos (1 - 20) used for training. If vid_ids = [1, 3, 5], then the
+ videos with subscripts -1, -3, and -5 from each class will be used for training.
+ split - If split='train', the official train split (protocol-II) is used for training. Note: Only one of
+ vid_ids or split option can be used at a time.
+ """
+ root = env_settings().lasot_dir if root is None else root
+ super().__init__(root, image_loader)
+
+ self.sequence_list = self._build_sequence_list(vid_ids, split)
+ self.filter = filter
+
+ def _build_sequence_list(self, vid_ids=None, split=None):
+ if split is not None:
+ if vid_ids is not None:
+ raise ValueError('Cannot set both split and vid_ids.')
+ ltr_path = os.path.join(
+ os.path.dirname(os.path.realpath(__file__)), '..')
+ if split == 'train':
+ file_path = os.path.join(ltr_path, 'data_specs',
+ 'lasot_train_split.txt')
+ else:
+ raise ValueError('Unknown split name.')
+ sequence_list = pandas.read_csv(
+ file_path, header=None, squeeze=True).values.tolist()
+ elif vid_ids is not None:
+ sequence_list = [
+ c + '-' + str(v) for c in self.class_list for v in vid_ids
+ ]
+ else:
+ raise ValueError('Set either split or vid_ids.')
+
+ return sequence_list
+
+ def get_name(self):
+ return 'lasot'
+
+ def get_num_sequences(self):
+ return len(self.sequence_list)
+
+ def _read_anno(self, seq_path):
+ anno_file = os.path.join(seq_path, "groundtruth.txt")
+ gt = pandas.read_csv(
+ anno_file,
+ delimiter=',',
+ header=None,
+ dtype=np.float32,
+ na_filter=False,
+ low_memory=False).values
+ return np.array(gt)
+
+ def _read_target_visible(self, seq_path, anno):
+ # Read full occlusion and out_of_view
+ occlusion_file = os.path.join(seq_path, "full_occlusion.txt")
+ out_of_view_file = os.path.join(seq_path, "out_of_view.txt")
+
+ with open(occlusion_file, 'r', newline='') as f:
+ occlusion = np.array([int(v) for v in list(csv.reader(f))[0]],
+ 'byte')
+ with open(out_of_view_file, 'r') as f:
+ out_of_view = np.array([int(v) for v in list(csv.reader(f))[0]],
+ 'byte')
+
+ target_visible = ~occlusion & ~out_of_view & (anno[:, 2] > 0) & (
+ anno[:, 3] > 0)
+
+ return target_visible
+
+ def _get_sequence_path(self, seq_id):
+ seq_name = self.sequence_list[seq_id]
+ class_name = seq_name.split('-')[0]
+ vid_id = seq_name.split('-')[1]
+
+ return os.path.join(self.root, class_name, class_name + '-' + vid_id)
+
+ def get_sequence_info(self, seq_id):
+ seq_path = self._get_sequence_path(seq_id)
+ anno = self._read_anno(seq_path)
+ target_visible = self._read_target_visible(seq_path, anno)
+ if self.filter is not None:
+ target_large = (anno[:, 2] * anno[:, 3] > 30 * 30)
+ ratio = anno[:, 2] / anno[:, 3]
+ target_reasonable_ratio = (10 > ratio) & (ratio > 0.1)
+ target_visible = target_visible & target_reasonable_ratio & target_large
+ return anno, target_visible
+
+ def _get_frame_path(self, seq_path, frame_id):
+ return os.path.join(
+ seq_path, 'img',
+ '{:08}.jpg'.format(frame_id + 1)) # frames start from 1
+
+ def _get_frame(self, seq_path, frame_id):
+ return self.image_loader(self._get_frame_path(seq_path, frame_id))
+
+ def _get_class(self, seq_path):
+ obj_class = seq_path.split('/')[-2]
+ return obj_class
+
+ def get_frames(self, seq_id, frame_ids, anno=None):
+ seq_path = self._get_sequence_path(seq_id)
+
+ obj_class = self._get_class(seq_path)
+ frame_list = [self._get_frame(seq_path, f_id) for f_id in frame_ids]
+
+ if anno is None:
+ anno = self._read_anno(seq_path)
+
+ # Return as list of tensors
+ anno_frames = [anno[f_id, :] for f_id in frame_ids]
+
+ object_meta = OrderedDict({
+ 'object_class': obj_class,
+ 'motion_class': None,
+ 'major_class': None,
+ 'root_class': None,
+ 'motion_adverb': None
+ })
+
+ return frame_list, anno_frames, object_meta
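+
+# Minimal usage sketch (the root path is an assumption, not part of this module):
+# dataset = Lasot(root='/data/LaSOT', split='train')
+# anno, visible = dataset.get_sequence_info(0)
+# frames, boxes, meta = dataset.get_frames(0, [0, 50, 100])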
diff --git a/PaddleCV/tracking/ltr/dataset/tracking_net.py b/PaddleCV/tracking/ltr/dataset/tracking_net.py
new file mode 100644
index 0000000000000000000000000000000000000000..62f5cb808f54069f76d60ec59b84d22815deb9d5
--- /dev/null
+++ b/PaddleCV/tracking/ltr/dataset/tracking_net.py
@@ -0,0 +1,117 @@
+import os
+import os.path
+import numpy as np
+import pandas
+from collections import OrderedDict
+
+from ltr.data.image_loader import default_image_loader
+from .base_dataset import BaseDataset
+from ltr.admin.environment import env_settings
+
+
+def list_sequences(root, set_ids):
+ """ Lists all the videos in the input set_ids. Returns a list of tuples (set_id, video_name)
+
+ args:
+ root: Root directory to TrackingNet
+ set_ids: Sets (0-11) which are to be used
+
+ returns:
+ list - list of tuples (set_id, video_name) containing the set_id and video_name for each sequence
+ """
+ sequence_list = []
+
+ for s in set_ids:
+ anno_dir = os.path.join(root, "TRAIN_" + str(s), "anno")
+
+ sequences_cur_set = [(s, os.path.splitext(f)[0])
+ for f in os.listdir(anno_dir)
+ if f.endswith('.txt')]
+ sequence_list += sequences_cur_set
+
+ return sequence_list
+
+
+class TrackingNet(BaseDataset):
+ """ TrackingNet dataset.
+
+ Publication:
+ TrackingNet: A Large-Scale Dataset and Benchmark for Object Tracking in the Wild.
+ Matthias Mueller,Adel Bibi, Silvio Giancola, Salman Al-Subaihi and Bernard Ghanem
+ ECCV, 2018
+ https://ivul.kaust.edu.sa/Documents/Publications/2018/TrackingNet%20A%20Large%20Scale%20Dataset%20and%20Benchmark%20for%20Object%20Tracking%20in%20the%20Wild.pdf
+
+ Download the dataset using the toolkit https://github.com/SilvioGiancola/TrackingNet-devkit.
+ """
+
+ def __init__(self,
+ root=None,
+ image_loader=default_image_loader,
+ set_ids=None):
+ """
+ args:
+ root - The path to the TrackingNet folder, containing the training sets.
+ image_loader (jpeg4py_loader) - The function to read the images. jpeg4py (https://github.com/ajkxyz/jpeg4py)
+ is used by default.
+ set_ids (None) - List containing the ids of the TrackingNet sets to be used for training. If None, all the
+ sets (0 - 11) will be used.
+ """
+ root = env_settings().trackingnet_dir if root is None else root
+ super().__init__(root, image_loader)
+
+ if set_ids is None:
+ set_ids = [i for i in range(12)]
+
+ self.set_ids = set_ids
+
+ # Keep a list of all videos. Sequence list is a list of tuples (set_id, video_name) containing the set_id and
+ # video_name for each sequence
+ self.sequence_list = list_sequences(self.root, self.set_ids)
+
+ def get_name(self):
+ return 'trackingnet'
+
+ def _read_anno(self, seq_id):
+ set_id = self.sequence_list[seq_id][0]
+ vid_name = self.sequence_list[seq_id][1]
+ anno_file = os.path.join(self.root, "TRAIN_" + str(set_id), "anno",
+ vid_name + ".txt")
+ gt = pandas.read_csv(
+ anno_file,
+ delimiter=',',
+ header=None,
+ dtype=np.float32,
+ na_filter=False,
+ low_memory=False).values
+ return np.array(gt)
+
+ def get_sequence_info(self, seq_id):
+ anno = self._read_anno(seq_id)
+ target_visible = (anno[:, 2] > 0) & (anno[:, 3] > 0)
+ return anno, target_visible
+
+ def _get_frame(self, seq_id, frame_id):
+ set_id = self.sequence_list[seq_id][0]
+ vid_name = self.sequence_list[seq_id][1]
+ frame_path = os.path.join(self.root, "TRAIN_" + str(set_id), "frames",
+ vid_name, str(frame_id) + ".jpg")
+ return self.image_loader(frame_path)
+
+ def get_frames(self, seq_id, frame_ids, anno=None):
+ frame_list = [self._get_frame(seq_id, f) for f in frame_ids]
+
+ if anno is None:
+ anno = self._read_anno(seq_id)
+
+ # Return as list of tensors
+ anno_frames = [anno[f_id, :] for f_id in frame_ids]
+
+ object_meta = OrderedDict({
+ 'object_class': None,
+ 'motion_class': None,
+ 'major_class': None,
+ 'root_class': None,
+ 'motion_adverb': None
+ })
+
+ return frame_list, anno_frames, object_meta
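+
+# Minimal usage sketch (root path and set ids are assumptions):
+# dataset = TrackingNet(root='/data/TrackingNet', set_ids=[0, 1])
+# anno, visible = dataset.get_sequence_info(0)
+# frames, boxes, meta = dataset.get_frames(0, [0, 1, 2])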
diff --git a/PaddleCV/tracking/ltr/dataset/vot.py b/PaddleCV/tracking/ltr/dataset/vot.py
new file mode 100644
index 0000000000000000000000000000000000000000..3720a0585642af8b7b1deadea8009d17f977869c
--- /dev/null
+++ b/PaddleCV/tracking/ltr/dataset/vot.py
@@ -0,0 +1,140 @@
+import os
+from .base_dataset import BaseDataset
+from ltr.data.image_loader import default_image_loader
+import numpy as np
+import cv2 as cv
+from collections import OrderedDict
+from ltr.admin.environment import env_settings
+
+
+def get_axis_aligned_bbox(region):
+ region = np.array(region)
+ if len(region.shape) == 3:
+ # region (1,4,2)
+ region = np.array([
+ region[0][0][0], region[0][0][1], region[0][1][0], region[0][1][1],
+ region[0][2][0], region[0][2][1], region[0][3][0], region[0][3][1]
+ ])
+
+ cx = np.mean(region[0::2])
+ cy = np.mean(region[1::2])
+ x1 = min(region[0::2])
+
+ x2 = max(region[0::2])
+ y1 = min(region[1::2])
+ y2 = max(region[1::2])
+
+ A1 = np.linalg.norm(region[0:2] - region[2:4]) * np.linalg.norm(region[
+ 2:4] - region[4:6])
+ A2 = (x2 - x1) * (y2 - y1)
+ s = np.sqrt(A1 / A2)
+ w = s * (x2 - x1) + 1
+ h = s * (y2 - y1) + 1
+
+ x11 = cx - w // 2
+ y11 = cy - h // 2
+
+ return x11, y11, w, h
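+
+# Quick sanity check (illustrative): for the axis-aligned square polygon
+# [0, 0, 10, 0, 10, 10, 0, 10] the function returns (0.0, 0.0, 11.0, 11.0);
+# the extra pixel comes from the "+ 1" width/height convention used above.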
+
+
+class VOT(BaseDataset):
+ def __init__(self, root=None, image_loader=default_image_loader):
+ # root = env_settings().vot_dir if root is None else root
+ assert root is not None
+ super().__init__(root, image_loader)
+
+ self.sequence_list = self._get_sequence_list()
+ self.ann = self._get_annotations()
+
+ def _get_sequence_list(self):
+ seq_list = []
+ for d in os.listdir(self.root):
+ if os.path.isdir(os.path.join(self.root, d)):
+ seq_list.append(d)
+ return sorted(seq_list)
+
+ def _get_annotations(self):
+ ann = {}
+ for seq in self.sequence_list:
+ ann[seq] = {'bbox': [], 'rbb': []}
+ with open(os.path.join(self.root, seq, 'groundtruth.txt')) as f:
+ lines = [l.strip().split(',') for l in f.readlines()]
+ for l in lines:
+ vs = [float(v) for v in l]
+ if len(vs) == 4:
+ polys = [
+ vs[0], vs[1] + vs[3] - 1, vs[0], vs[1],
+ vs[0] + vs[2] - 1, vs[1], vs[0] + vs[2] - 1,
+ vs[1] + vs[3] - 1
+ ]
+ else:
+ polys = vs
+
+ box = get_axis_aligned_bbox(polys)
+ rbb = cv.minAreaRect(
+ np.int0(np.array(polys).reshape((-1, 2))))
+ # assume small rotation angle, switch height, width
+ if rbb[2] < -45:
+ angle = rbb[2] + 90
+ height = rbb[1][0]
+ width = rbb[1][1]
+ else:
+ angle = rbb[2]
+ height = rbb[1][1]
+ width = rbb[1][0]
+ rbb = [rbb[0][0], rbb[0][1], width, height, angle]
+ ann[seq]['bbox'].append(box)
+ ann[seq]['rbb'].append(rbb)
+ return ann
+
+ def is_video_sequence(self):
+ return True
+
+ def get_name(self):
+ return 'vot'
+
+ def get_num_sequences(self):
+ return len(self.sequence_list)
+
+ def get_sequence_info(self, seq_id):
+ anno = self._get_anno(seq_id)
+ target_visible = (anno[:, 2] > 0) & (anno[:, 3] > 0)
+ return anno, target_visible
+
+ def _get_anno(self, seq_id):
+ anno = self.ann[self.sequence_list[seq_id]]['bbox']
+ return np.reshape(np.array(anno), (-1, 4))
+
+ def get_meta_info(self, seq_id):
+ object_meta = OrderedDict({
+ 'object_class': None,
+ 'motion_class': None,
+ 'major_class': None,
+ 'root_class': None,
+ 'motion_adverb': None
+ })
+ return object_meta
+
+ def _get_sequence_path(self, seq_id):
+ return os.path.join(self.root, self.sequence_list[seq_id])
+
+ def _get_frame_path(self, seq_path, frame_id):
+ return os.path.join(
+ seq_path, 'color',
+ '{:08}.jpg'.format(frame_id + 1)) # frames start from 1
+
+ def _get_frame(self, seq_path, frame_id):
+ return self.image_loader(self._get_frame_path(seq_path, frame_id))
+
+ def get_frames(self, seq_id=None, frame_ids=None, anno=None):
+ seq_path = self._get_sequence_path(seq_id)
+ frame_list = [self._get_frame(seq_path, f_id) for f_id in frame_ids]
+
+ if anno is None:
+ anno = self._get_anno(seq_id)
+
+ anno_frames = [anno[f_id, :] for f_id in frame_ids]
+
+ object_meta = self.get_meta_info(seq_id)
+
+ return frame_list, anno_frames, object_meta
diff --git a/PaddleCV/tracking/ltr/dataset/youtube_bb.py b/PaddleCV/tracking/ltr/dataset/youtube_bb.py
new file mode 100644
index 0000000000000000000000000000000000000000..5628c5714d50976b6718874776a2cd01403ec8e2
--- /dev/null
+++ b/PaddleCV/tracking/ltr/dataset/youtube_bb.py
@@ -0,0 +1,114 @@
+import os
+from .base_dataset import BaseDataset
+from ltr.data.image_loader import default_image_loader
+import xml.etree.ElementTree as ET
+import json
+import pickle
+from collections import OrderedDict
+import numpy as np
+import nltk
+from nltk.corpus import wordnet
+from ltr.admin.environment import env_settings
+
+
+def get_target_to_image_ratio(seq):
+ anno = np.array(seq['anno'])
+ img_sz = np.array(seq['image_size'])
+ return np.sqrt(anno[0, 2:4].prod() / (img_sz.prod()))
+
+
+class YoutubeBB(BaseDataset):
+ """ YoutubeBB dataset.
+
+ Publication:
+ YouTube-BoundingBoxes: A Large High-Precision Human-Annotated Data Set for Object Detection in Video
+ Esteban Real, Jonathon Shlens, Stefano Mazzocchi, Xin Pan and Vincent Vanhoucke
+ CVPR, 2017
+ https://arxiv.org/pdf/1702.00824.pdf
+
+ Download the dataset from https://research.google.com/youtube-bb/
+ """
+
+ def __init__(self,
+ root=None,
+ filter=None,
+ image_loader=default_image_loader,
+ min_length=0,
+ max_target_area=1):
+ """
+ args:
+ root - path to the youtube-bb dataset.
+ image_loader (jpeg4py_loader) - The function to read the images. jpeg4py (https://github.com/ajkxyz/jpeg4py)
+ is used by default.
+ min_length - Minimum allowed sequence length.
+ max_target_area - max allowed ratio between target area and image area. Can be used to filter out targets
+ which cover complete image.
+ """
+ super().__init__(root, image_loader)
+
+ meta_file = os.path.join(root, 'ytb_meta.pickle')
+ with open(meta_file, 'rb') as f:
+ meta = pickle.load(f)
+
+ sequence_list = []
+ for video_name, video_info in meta:
+ if 'ILSVRC' not in video_name:
+ for trkid in video_info:
+ if len(video_info[trkid]['img']) > 2:
+ # build a fresh dict per tracklet so entries do not alias each other
+ seq_info = {}
+ seq_info['video_name'] = video_name
+ seq_info['anno'] = video_info[trkid]['box']
+ seq_info['img_paths'] = video_info[trkid]['img']
+ sequence_list.append(seq_info)
+
+ print('num_sequences: {}'.format(len(sequence_list)))
+ self.sequence_list = sequence_list
+
+ # Filter the sequences based on min_length and max_target_area in the first frame
+ # self.sequence_list = [x for x in self.sequence_list if len(x['anno']) >= min_length and
+ # get_target_to_image_ratio(x) < max_target_area]
+ self.filter = filter
+
+ def get_name(self):
+ return 'youtubebb'
+
+ def get_num_sequences(self):
+ return len(self.sequence_list)
+
+ def get_sequence_info(self, seq_id):
+ anno = np.array(self.sequence_list[seq_id]['anno'])
+ target_visible = (anno[:, 2] > 0) & (anno[:, 3] > 0)
+ if self.filter is not None:
+ target_large = (anno[:, 2] * anno[:, 3] > 30 * 30)
+ target_reasonable_size = (anno[:, 2] * anno[:, 3] < 500 * 500)
+ ratio = anno[:, 2] / anno[:, 3]
+ target_reasonable_ratio = (10 > ratio) & (ratio > 0.1)
+ target_visible = target_visible & target_reasonable_ratio & target_large & target_reasonable_size
+ return anno, target_visible
+
+ def _get_frame(self, sequence, frame_id):
+ frame_path = os.path.join(self.root, sequence['video_name'],
+ sequence['img_paths'][frame_id] + '.jpg')
+ return self.image_loader(frame_path)
+
+ def get_frames(self, seq_id, frame_ids, anno=None):
+ sequence = self.sequence_list[seq_id]
+ frame_list = [self._get_frame(sequence, f) for f in frame_ids]
+
+ if anno is None:
+ anno = np.array(sequence['anno'])
+
+ # Return as list of tensors
+ anno_frames = [anno[f_id, :] for f_id in frame_ids]
+
+ # no class meta info is available for this dataset
+ object_meta = OrderedDict({
+ 'object_class': None,
+ 'motion_class': None,
+ 'major_class': None,
+ 'root_class': None,
+ 'motion_adverb': None
+ })
+
+ return frame_list, anno_frames, object_meta
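+
+# Expected layout of ytb_meta.pickle, inferred from the parsing loop in __init__
+# (an assumption about the preprocessed metadata, not an official schema):
+# [(video_name, {track_id: {'img': [frame_stem, ...], 'box': [[x, y, w, h], ...]}}), ...]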
diff --git a/PaddleCV/tracking/ltr/dataset/youtube_vos.py b/PaddleCV/tracking/ltr/dataset/youtube_vos.py
new file mode 100644
index 0000000000000000000000000000000000000000..f884272f4bab666b740cc80461de7056ec39ed96
--- /dev/null
+++ b/PaddleCV/tracking/ltr/dataset/youtube_vos.py
@@ -0,0 +1,152 @@
+import os
+from .base_dataset import BaseDataset
+from ltr.data.image_loader import default_image_loader
+import numpy as np
+import cv2 as cv
+import json
+from collections import OrderedDict
+from ltr.admin.environment import env_settings
+
+
+def get_axis_aligned_bbox(region):
+ region = np.array(region)
+ if len(region.shape) == 3:
+ # region (1,4,2)
+ region = np.array([
+ region[0][0][0], region[0][0][1], region[0][1][0], region[0][1][1],
+ region[0][2][0], region[0][2][1], region[0][3][0], region[0][3][1]
+ ])
+
+ cx = np.mean(region[0::2])
+ cy = np.mean(region[1::2])
+ x1 = min(region[0::2])
+
+ x2 = max(region[0::2])
+ y1 = min(region[1::2])
+ y2 = max(region[1::2])
+
+ A1 = np.linalg.norm(region[0:2] - region[2:4]) * np.linalg.norm(region[
+ 2:4] - region[4:6])
+ A2 = (x2 - x1) * (y2 - y1)
+ s = np.sqrt(A1 / A2)
+ if np.isnan(s): # an 'is np.nan' identity check misses computed NaNs
+ x11, y11, w, h = 0, 0, 0, 0
+ else:
+ w = s * (x2 - x1) + 1
+ h = s * (y2 - y1) + 1
+
+ x11 = cx - w // 2
+ y11 = cy - h // 2
+ return x11, y11, w, h
+
+
+class VOS(BaseDataset):
+ def __init__(self, root=None, image_loader=default_image_loader):
+ # root = env_settings().vot_dir if root is None else root
+ assert root is not None
+ super().__init__(root, image_loader)
+
+ with open(os.path.join(self.root, 'meta.json')) as f:
+ self.meta = json.load(f)['videos']
+
+ self.sequence_list = self._get_sequence_list()
+ self.ann = self._get_annotations()
+
+ def _get_sequence_list(self):
+ seq_list = []
+ videos = self.meta.keys()
+ for v in videos:
+ objs = self.meta[v]['objects'].keys()
+ for o in objs:
+ if "rotate_box" in self.meta[v]['objects'][o]:
+ seq_list.append((v, o))
+ assert len(seq_list) > 0
+ return seq_list
+
+ def _get_annotations(self):
+ ann = {}
+ for seq in self.sequence_list:
+ ann[seq] = {'bbox': [], 'rbb': []}
+ polygons = self.meta[seq[0]]['objects'][seq[1]]['rotate_box']
+ for vs in polygons:
+ if len(vs) == 4:
+ polys = [
+ vs[0], vs[1] + vs[3] - 1, vs[0], vs[1],
+ vs[0] + vs[2] - 1, vs[1], vs[0] + vs[2] - 1,
+ vs[1] + vs[3] - 1
+ ]
+ else:
+ polys = vs
+ if not np.all(np.array(polys) == 0):
+ box = get_axis_aligned_bbox(polys)
+ rbb = cv.minAreaRect(
+ np.int0(np.array(polys).reshape((-1, 2))))
+ else:
+ box = np.array([0, 0, 0, 0])
+ rbb = ((0, 0), (0, 0), 0)
+ if box[2] * box[3] > 500 * 500:
+ print(box)
+ # assume small rotation angle, switch height, width
+ if rbb[2] < -45:
+ angle = rbb[2] + 90
+ height = rbb[1][0]
+ width = rbb[1][1]
+ else:
+ angle = rbb[2]
+ height = rbb[1][1]
+ width = rbb[1][0]
+ rbb = [rbb[0][0], rbb[0][1], width, height, angle]
+ ann[seq]['bbox'].append(box)
+ ann[seq]['rbb'].append(rbb)
+ return ann
+
+ def is_video_sequence(self):
+ return True
+
+ def get_name(self):
+ return 'vot'
+
+ def get_num_sequences(self):
+ return len(self.sequence_list)
+
+ def get_sequence_info(self, seq_id):
+ anno = self._get_anno(seq_id)
+ target_visible = (anno[:, 2] > 0) & (anno[:, 3] > 0)
+ target_large = (anno[:, 2] * anno[:, 3] > 30 * 30)
+ target_reasonable_size = (anno[:, 2] * anno[:, 3] < 500 * 500)
+ return anno, target_visible & target_large & target_reasonable_size
+
+ def _get_anno(self, seq_id):
+ anno = self.ann[self.sequence_list[seq_id]]['bbox']
+ return np.reshape(np.array(anno), (-1, 4))
+
+ def get_meta_info(self, seq_id):
+ object_meta = OrderedDict({
+ 'object_class': None,
+ 'motion_class': None,
+ 'major_class': None,
+ 'root_class': None,
+ 'motion_adverb': None
+ })
+ return object_meta
+
+ def _get_frame_path(self, seq_id, frame_id):
+ v, o = self.sequence_list[seq_id]
+ frame_name = self.meta[v]['objects'][o]['frames'][frame_id]
+ return os.path.join(self.root, 'JPEGImages', v,
+ '{}.jpg'.format(frame_name)) # frame names come from meta.json
+
+ def _get_frame(self, seq_id, frame_id):
+ return self.image_loader(self._get_frame_path(seq_id, frame_id))
+
+ def get_frames(self, seq_id=None, frame_ids=None, anno=None):
+ frame_list = [self._get_frame(seq_id, f_id) for f_id in frame_ids]
+
+ if anno is None:
+ anno = self._get_anno(seq_id)
+
+ anno_frames = [anno[f_id, :] for f_id in frame_ids]
+
+ object_meta = self.get_meta_info(seq_id)
+
+ return frame_list, anno_frames, object_meta
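+
+# The loader assumes <root>/meta.json with per-video 'objects' entries carrying a
+# 'rotate_box' polygon list and a 'frames' list, and frames stored as
+# <root>/JPEGImages/<video>/<frame_name>.jpg (layout inferred from the code above).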
diff --git a/PaddleCV/tracking/ltr/models/backbone/resnet.py b/PaddleCV/tracking/ltr/models/backbone/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..a576ad577f64499cafabc4c2185d84e93130c027
--- /dev/null
+++ b/PaddleCV/tracking/ltr/models/backbone/resnet.py
@@ -0,0 +1,322 @@
+import os
+
+import paddle.fluid as fluid
+import paddle.fluid.dygraph.nn as nn
+from ltr.admin.environment import env_settings
+
+CURRENT_DIR = os.path.dirname(__file__)
+
+
+def weight_init():
+ init = fluid.initializer.MSRAInitializer(uniform=False)
+ param = fluid.ParamAttr(initializer=init)
+ return param
+
+
+def norm_weight_init(constant=1.0):
+ init = fluid.initializer.ConstantInitializer(constant)
+ param = fluid.ParamAttr(initializer=init)
+ return param
+
+
+def norm_bias_init():
+ init = fluid.initializer.ConstantInitializer(value=0.)
+ param = fluid.ParamAttr(initializer=init)
+ return param
+
+
+class ConvBNLayer(fluid.dygraph.Layer):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ filter_size,
+ stride=1,
+ groups=1,
+ bn_init_constant=1.0,
+ is_test=False):
+ super(ConvBNLayer, self).__init__()
+
+ self.conv = nn.Conv2D(
+ num_channels=in_channels,
+ filter_size=filter_size,
+ num_filters=out_channels,
+ stride=stride,
+ padding=(filter_size - 1) // 2,
+ groups=groups,
+ bias_attr=False,
+ param_attr=weight_init())
+ self.bn = nn.BatchNorm(
+ out_channels,
+ param_attr=norm_weight_init(bn_init_constant),
+ bias_attr=norm_bias_init(),
+ act=None,
+ momentum=0.9,
+ use_global_stats=is_test)
+
+ def forward(self, inputs):
+ res = self.conv(inputs)
+ self.conv_res = res
+ res = self.bn(res)
+ return res
+
+
+class BasicBlock(fluid.dygraph.Layer):
+ expansion = 1
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ stride=1,
+ is_downsample=None,
+ is_test=False):
+
+ super(BasicBlock, self).__init__()
+ self.expansion = 1
+
+ self.conv_bn1 = ConvBNLayer(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ filter_size=3,
+ stride=stride,
+ groups=1,
+ is_test=is_test)
+ self.conv_bn2 = ConvBNLayer(
+ in_channels=out_channels,
+ out_channels=out_channels,
+ filter_size=3,
+ stride=1,
+ groups=1,
+ is_test=is_test)
+
+ self.is_downsample = is_downsample
+ if self.is_downsample:
+ self.downsample = ConvBNLayer(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ filter_size=1,
+ stride=stride,
+ is_test=is_test)
+ self.stride = stride
+
+ def forward(self, inputs):
+ identity = inputs
+ res = self.conv_bn1(inputs)
+ res = fluid.layers.relu(res)
+
+ res = self.conv_bn2(res)
+
+ if self.is_downsample:
+ identity = self.downsample(identity)
+
+ res += identity
+ res = fluid.layers.relu(res)
+ return res
+
+
+class Bottleneck(fluid.dygraph.Layer):
+ expansion = 4
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ stride=1,
+ is_downsample=None,
+ base_width=64,
+ dilation=1,
+ groups=1,
+ is_test=False):
+ super(Bottleneck, self).__init__()
+
+ width = int(out_channels * (base_width / 64.)) * groups
+
+ self.conv_bn1 = ConvBNLayer(
+ in_channels=in_channels,
+ filter_size=1,
+ out_channels=width,
+ groups=1,
+ is_test=is_test)
+ self.conv_bn2 = ConvBNLayer(
+ in_channels=width,
+ filter_size=3,
+ out_channels=width,
+ stride=stride,
+ groups=groups,
+ is_test=is_test)
+ self.conv_bn3 = ConvBNLayer(
+ in_channels=width,
+ filter_size=1,
+ out_channels=out_channels * self.expansion,
+ bn_init_constant=0.,
+ is_test=is_test)
+ self.is_downsample = is_downsample
+ if self.is_downsample:
+ self.downsample = ConvBNLayer(
+ in_channels=in_channels,
+ out_channels=out_channels * self.expansion,
+ filter_size=1,
+ stride=stride,
+ is_test=is_test)
+
+ self.stride = stride
+
+ def forward(self, inputs):
+ identity = inputs
+
+ out = self.conv_bn1(inputs)
+ out = fluid.layers.relu(out)
+
+ out = self.conv_bn2(out)
+ out = fluid.layers.relu(out)
+
+ out = self.conv_bn3(out)
+
+ if self.is_downsample:
+ identity = self.downsample(inputs)
+
+ out += identity
+ out = fluid.layers.relu(out)
+ return out
+
+
+class ResNet(fluid.dygraph.Layer):
+ def __init__(self,
+ name,
+ Block,
+ layers,
+ num_classes=1000,
+ groups=1,
+ is_test=False):
+ """
+
+ :param name: str, namescope
+ :param layers: int, the layer of defined network
+ :param num_classes: int, the dimension of final output
+ :param groups: int, default is 1
+ """
+ super(ResNet, self).__init__(name_scope=name)
+
+ support_layers = [18, 34, 50, 101, 152]
+ assert layers in support_layers, \
+ "support layer can only be one of [18, 34, 50, 101, 152]"
+ self.layers = layers
+
+ if layers == 18:
+ depths = [2, 2, 2, 2]
+ elif layers == 50 or layers == 34:
+ depths = [3, 4, 6, 3]
+ elif layers == 101:
+ depths = [3, 4, 23, 3]
+ elif layers == 152:
+ depths = [3, 8, 36, 3]
+
+ strides = [1, 2, 2, 2]
+ num_filters = [64, 128, 256, 512]
+
+ self.in_channels = 64
+ self.dilation = 1
+ self.groups = groups
+
+ self.conv_bn_init = ConvBNLayer(
+ 3,
+ out_channels=self.in_channels,
+ filter_size=7,
+ stride=2,
+ is_test=is_test)
+
+ block_collect = []
+ downsample = None
+ for i in range(len(depths)):
+ # collect layers in each block
+ _block = []
+
+ stride = strides[i]
+ out_channel = num_filters[i]
+
+ if stride != 1 or self.in_channels != num_filters[
+ i] * Block.expansion:
+ downsample = True
+ bottleneck_block = self.add_sublayer(
+ "block{}_0".format(i),
+ Block(
+ self.in_channels,
+ out_channel,
+ stride=stride,
+ is_downsample=downsample,
+ is_test=is_test))
+
+ downsample = False
+
+ _block.append(bottleneck_block)
+
+ self.in_channels = num_filters[i] * Block.expansion
+
+ for j in range(1, depths[i]):
+ bottleneck_block = self.add_sublayer(
+ "block{}_{}".format(i, j),
+ Block(
+ self.in_channels, out_channel, is_test=is_test))
+ _block.append(bottleneck_block)
+
+ # collect blocks
+ block_collect.append(_block)
+
+ self.block_collect = block_collect
+
+ self.maxpool = nn.Pool2D(
+ pool_size=3, pool_stride=2, pool_padding=1, pool_type="max")
+
+ self.global_pool = nn.Pool2D(pool_type='avg', global_pooling=True)
+ self.fc = nn.Linear(
+ input_dim=512 * Block.expansion, output_dim=num_classes)
+
+ def _add_output_and_check(self, name, x, outputs, output_layers):
+ if name in output_layers:
+ outputs[name] = x
+ return len(output_layers) == len(outputs)
+
+ def forward(self, inputs, feat_layers):
+ out = {}
+ res = self.conv_bn_init(inputs)
+ res = fluid.layers.relu(res)
+ res = self.maxpool(res)
+
+ # out['conv_init'] = res
+ for i in range(len(self.block_collect)):
+
+ for layer in self.block_collect[i]:
+ res = layer(res)
+
+ name = 'block{}'.format(i)
+ if name in feat_layers:
+ out[name] = res
+ if len(out) == len(feat_layers):
+ return out
+
+ res = self.global_pool(res)
+ B, C, _, _ = res.shape
+ res = fluid.layers.reshape(res, [B, C])
+ res = self.fc(res)
+ out['fc'] = res
+ return out
+
+
+def resnet18(name, is_test=False, pretrained=False):
+ net = ResNet(name, Block=BasicBlock, layers=18, is_test=is_test)
+ if pretrained:
+ params_path = os.path.join(env_settings().backbone_dir, 'ResNet18')
+ print("=> loading backbone model from '{}'".format(params_path))
+ params, _ = fluid.load_dygraph(params_path)
+ net.load_dict(params)
+ print("Done")
+ return net
+
+
+def resnet50(name, is_test=False, pretrained=False):
+ net = ResNet(name, Block=Bottleneck, layers=50, is_test=is_test)
+ if pretrained:
+ params_path = os.path.join(env_settings().backbone_dir, 'ResNet50')
+ print("=> loading backbone model from '{}'".format(params_path))
+ params, _ = fluid.load_dygraph(params_path)
+ net.load_dict(params)
+ print("Done")
+ return net
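+
+# Usage sketch (input size is illustrative; feature names follow the 'block{i}'
+# convention used in forward above):
+# import numpy as np
+# with fluid.dygraph.guard():
+#     net = resnet18('ResNet18', is_test=True, pretrained=False)
+#     x = fluid.dygraph.to_variable(np.ones([1, 3, 288, 288], dtype='float32'))
+#     feats = net(x, feat_layers=['block1', 'block2'])  # dict of intermediate features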
diff --git a/PaddleCV/tracking/ltr/models/backbone/sfc_alexnet.py b/PaddleCV/tracking/ltr/models/backbone/sfc_alexnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c80df9115f9b34d9ef30ba59c54b45d9b8da87a
--- /dev/null
+++ b/PaddleCV/tracking/ltr/models/backbone/sfc_alexnet.py
@@ -0,0 +1,149 @@
+from collections import OrderedDict
+
+from paddle import fluid
+from paddle.fluid.dygraph import nn
+
+
+class SFC_AlexNet(fluid.dygraph.Layer):
+ def __init__(self, name, is_test):
+ super(SFC_AlexNet, self).__init__()
+
+ self.is_test = is_test
+ self.layer_init()
+
+ def layer_init(self):
+ # for conv1
+ self.conv1 = nn.Conv2D(
+ num_channels=3,
+ num_filters=96,
+ filter_size=11,
+ stride=2,
+ padding=0,
+ groups=1,
+ param_attr=self.weight_init(),
+ bias_attr=self.bias_init())
+ self.bn1 = nn.BatchNorm(
+ num_channels=96,
+ is_test=self.is_test,
+ param_attr=self.norm_weight_init(),
+ bias_attr=self.bias_init(),
+ use_global_stats=self.is_test)
+ self.pool1 = nn.Pool2D(
+ pool_size=3, pool_type="max", pool_stride=2, pool_padding=0)
+ # for conv2
+ self.conv2 = nn.Conv2D(
+ num_channels=96,
+ num_filters=256,
+ filter_size=5,
+ stride=1,
+ padding=0,
+ groups=2,
+ param_attr=self.weight_init(),
+ bias_attr=self.bias_init())
+ self.bn2 = nn.BatchNorm(
+ num_channels=256,
+ is_test=self.is_test,
+ param_attr=self.norm_weight_init(),
+ bias_attr=self.bias_init(),
+ use_global_stats=self.is_test)
+ self.pool2 = nn.Pool2D(
+ pool_size=3, pool_type="max", pool_stride=2, pool_padding=0)
+ # for conv3
+ self.conv3 = nn.Conv2D(
+ num_channels=256,
+ num_filters=384,
+ filter_size=3,
+ stride=1,
+ padding=0,
+ groups=1,
+ param_attr=self.weight_init(),
+ bias_attr=self.bias_init())
+ self.bn3 = nn.BatchNorm(
+ num_channels=384,
+ is_test=self.is_test,
+ param_attr=self.norm_weight_init(),
+ bias_attr=self.bias_init(),
+ use_global_stats=self.is_test)
+ # for conv4
+ self.conv4 = nn.Conv2D(
+ num_channels=384,
+ num_filters=384,
+ filter_size=3,
+ stride=1,
+ padding=0,
+ groups=2,
+ param_attr=self.weight_init(),
+ bias_attr=self.bias_init())
+ self.bn4 = nn.BatchNorm(
+ num_channels=384,
+ is_test=self.is_test,
+ param_attr=self.norm_weight_init(),
+ bias_attr=self.bias_init(),
+ use_global_stats=self.is_test)
+ # for conv5
+ self.conv5 = nn.Conv2D(
+ num_channels=384,
+ num_filters=256,
+ filter_size=3,
+ stride=1,
+ padding=0,
+ groups=2,
+ param_attr=self.weight_init(),
+ bias_attr=self.bias_init())
+
+ def _add_output_and_check(self, name, x, outputs, output_layers):
+ if name in output_layers:
+ outputs[name] = x
+ return len(output_layers) == len(outputs)
+
+ def forward(self, inputs, output_layers):
+ outputs = OrderedDict()
+
+ out1 = self.conv1(inputs)
+ out1 = self.bn1(out1)
+ out1 = fluid.layers.relu(out1)
+ if self._add_output_and_check('conv1', out1, outputs, output_layers):
+ return outputs
+
+ out1 = self.pool1(out1)
+
+ out2 = self.conv2(out1)
+ out2 = self.bn2(out2)
+ out2 = fluid.layers.relu(out2)
+ if self._add_output_and_check('conv2', out2, outputs, output_layers):
+ return outputs
+
+ out2 = self.pool2(out2)
+
+ out3 = self.conv3(out2)
+ out3 = self.bn3(out3)
+ out3 = fluid.layers.relu(out3)
+ if self._add_output_and_check('conv3', out3, outputs, output_layers):
+ return outputs
+
+ out4 = self.conv4(out3)
+ out4 = self.bn4(out4)
+ out4 = fluid.layers.relu(out4)
+ if self._add_output_and_check('conv4', out4, outputs, output_layers):
+ return outputs
+
+ out5 = self.conv5(out4)
+ if self._add_output_and_check('conv5', out5, outputs, output_layers):
+ return outputs
+
+ return outputs
+
+ def norm_weight_init(self):
+ init = fluid.initializer.ConstantInitializer(1.0)
+ param = fluid.ParamAttr(initializer=init)
+ return param
+
+ def weight_init(self):
+ init = fluid.initializer.MSRAInitializer(uniform=False)
+ param = fluid.ParamAttr(initializer=init)
+ return param
+
+ def bias_init(self):
+ init = fluid.initializer.ConstantInitializer(value=0.)
+ param = fluid.ParamAttr(initializer=init)
+ return param
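+
+# Usage sketch (the 127x127 exemplar size is the usual SiamFC choice, assumed here):
+# import numpy as np
+# with fluid.dygraph.guard():
+#     net = SFC_AlexNet('AlexNet', is_test=True)
+#     z = fluid.dygraph.to_variable(np.zeros([1, 3, 127, 127], dtype='float32'))
+#     outs = net(z, output_layers=['conv5'])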
diff --git a/PaddleCV/tracking/ltr/models/bbreg/__init__.py b/PaddleCV/tracking/ltr/models/bbreg/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f2a4538cf68a96c1a3384eb197fbe5e53d500b7
--- /dev/null
+++ b/PaddleCV/tracking/ltr/models/bbreg/__init__.py
@@ -0,0 +1 @@
+from .atom_iou_net import AtomIouNet
diff --git a/PaddleCV/tracking/ltr/models/bbreg/atom.py b/PaddleCV/tracking/ltr/models/bbreg/atom.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef0ebd2e84964e98efd55454c8fc1d488a13cc66
--- /dev/null
+++ b/PaddleCV/tracking/ltr/models/bbreg/atom.py
@@ -0,0 +1,149 @@
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.dygraph as dygraph
+import os.path as osp
+import sys
+
+CURRENT_DIR = osp.dirname(__file__)
+sys.path.append(osp.join(CURRENT_DIR, '..', '..', '..'))
+
+from ltr.models.backbone.resnet import resnet50, resnet18
+from ltr.models.bbreg.atom_iou_net import AtomIouNet
+
+
+class ATOMnet(dygraph.layers.Layer):
+ def __init__(self,
+ name,
+ feature_extractor,
+ bb_regressor,
+ bb_regressor_layer,
+ extractor_grad=True):
+ """
+
+ :param feature_extractor: backbone
+ :param bb_regressor: IOUnet
+ :param bb_regressor_layer: list, which layer is used in IOUnet,
+ :param extractor_grad: default is True
+ """
+ super(ATOMnet, self).__init__(name)
+
+ self.feature_extractor = feature_extractor
+ self.bb_regressor = bb_regressor
+ self.bb_regressor_layer = bb_regressor_layer
+
+ layers_gt = ['block0', 'block1', 'block2', 'block3', 'fc']
+ if bb_regressor_layer is not None:
+ for key in bb_regressor_layer:
+ assert key in layers_gt
+ else:
+ raise ValueError("bb_regressor_layer can only be one of :",
+ layers_gt)
+
+ def forward(self, train_imgs, test_imgs, train_bb, test_proposals):
+ num_sequences = train_imgs.shape[-4]
+ num_train_images = train_imgs.shape[0] if len(
+ train_imgs.shape) == 5 else 1
+ num_test_images = test_imgs.shape[0] if len(test_imgs.shape) == 5 else 1
+
+ if len(train_imgs.shape) == 5:
+ train_imgs = fluid.layers.reshape(
+ train_imgs, [-1, *list(train_imgs.shape)[-3:]])
+ test_imgs = fluid.layers.reshape(test_imgs,
+ [-1, *list(test_imgs.shape)[-3:]])
+
+ train_feat = self.extract_backbone_features(train_imgs)
+ test_feat = self.extract_backbone_features(test_imgs)
+
+ # For clarity, send the features to bb_regressor in sequence form, i.e. [sequence, batch, feature, row, col]
+ train_feat_iou = [
+ fluid.layers.reshape(feat, (num_train_images, num_sequences,
+ *feat.shape[-3:]))
+ for feat in train_feat.values()
+ ]
+ test_feat_iou = [
+ fluid.layers.reshape(feat, (num_test_images, num_sequences,
+ *feat.shape[-3:]))
+ for feat in test_feat.values()
+ ]
+
+ # Obtain iou prediction
+ iou_pred = self.bb_regressor(train_feat_iou, test_feat_iou, train_bb,
+ test_proposals)
+ return iou_pred
+
+ def extract_backbone_features(self, im, layers=None):
+ if layers is None:
+ layers = self.bb_regressor_layer
+ return self.feature_extractor(im, layers)
+
+ def extract_features(self, im, layers):
+ return self.feature_extractor(im, layers)
+
+
+def atom_resnet18(iou_input_dim=(256, 256),
+ iou_inter_dim=(256, 256),
+ backbone_pretrained=True,
+ backbone_is_test=False,
+ iounet_is_test=False):
+ backbone = resnet18(
+ 'ResNet18', is_test=backbone_is_test, pretrained=backbone_pretrained)
+ iou_predictor = AtomIouNet(
+ 'IOUnet',
+ pred_input_dim=iou_input_dim,
+ pred_inter_dim=iou_inter_dim,
+ is_test=iounet_is_test)
+
+ model = ATOMnet(
+ 'ATOM',
+ feature_extractor=backbone,
+ bb_regressor=iou_predictor,
+ bb_regressor_layer=['block1', 'block2'],
+ extractor_grad=False)
+ return model
+
+
+def atom_resnet50(iou_input_dim=(256, 256),
+ iou_inter_dim=(256, 256),
+ backbone_pretrained=True,
+ backbone_is_test=False,
+ iounet_is_test=False):
+ backbone = resnet50(
+ 'ResNet50', is_test=backbone_is_test, pretrained=backbone_pretrained)
+ iou_predictor = AtomIouNet(
+ 'IOUnet',
+ input_dim=(512, 1024),
+ pred_input_dim=iou_input_dim,
+ pred_inter_dim=iou_inter_dim,
+ is_test=iounet_is_test)
+
+ model = ATOMnet(
+ 'ATOM',
+ feature_extractor=backbone,
+ bb_regressor=iou_predictor,
+ bb_regressor_layer=['block1', 'block2'],
+ extractor_grad=False)
+ return model
+
+
+if __name__ == '__main__':
+ import numpy as np
+
+ a = np.random.uniform(-1, 1, [1, 3, 144, 144]).astype(np.float32)
+ b = np.random.uniform(-1, 1, [1, 3, 144, 144]).astype(np.float32)
+ bbox = [[3, 4, 10, 11]]
+ proposal_bbox = [[4, 5, 11, 12] * 16]
+ bbox = np.reshape(np.array(bbox), [1, 1, 4]).astype(np.float32)
+ proposal_bbox = np.reshape(np.array(proposal_bbox),
+ [1, 16, 4]).astype(np.float32)
+ with fluid.dygraph.guard():
+ a_pd = fluid.dygraph.to_variable(a)
+ b_pd = fluid.dygraph.to_variable(b)
+ bbox_pd = fluid.dygraph.to_variable(bbox)
+ proposal_bbox_pd = fluid.dygraph.to_variable(proposal_bbox)
+
+ model = atom_resnet50()
+
+ res = model(a_pd, b_pd, bbox_pd, proposal_bbox_pd)
+ params = model.state_dict()
+ for v in params:
+ print(v)
diff --git a/PaddleCV/tracking/ltr/models/bbreg/atom_iou_net.py b/PaddleCV/tracking/ltr/models/bbreg/atom_iou_net.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fcb2f0f4f99692c10e8c0a2c2665cc0e6dffacb
--- /dev/null
+++ b/PaddleCV/tracking/ltr/models/bbreg/atom_iou_net.py
@@ -0,0 +1,350 @@
+"""
+Implementation of the ATOM IoU prediction network.
+"""
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.dygraph.nn as nn
+import numpy as np
+import os.path as osp
+import sys
+
+CURRENT_DIR = osp.dirname(__file__)
+sys.path.append(osp.join(CURRENT_DIR, '..', '..', '..'))
+
+
+def weight_init():
+ init = fluid.initializer.MSRAInitializer(uniform=False)
+ param = fluid.ParamAttr(initializer=init)
+ return param
+
+
+def bias_init():
+ init = fluid.initializer.ConstantInitializer(value=0.)
+ param = fluid.ParamAttr(initializer=init)
+ return param
+
+
+def norm_weight_init():
+ # init = fluid.initializer.ConstantInitializer(1.0)
+ init = fluid.initializer.Uniform(low=0., high=1.)
+ param = fluid.ParamAttr(initializer=init)
+ return param
+
+
+def norm_bias_init():
+ init = fluid.initializer.ConstantInitializer(value=0.)
+ param = fluid.ParamAttr(initializer=init)
+ return param
+
+
+class ConvBNReluLayer(fluid.dygraph.Layer):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ filter_size,
+ stride=1,
+ groups=1,
+ padding=1,
+ is_test=False):
+ super(ConvBNReluLayer, self).__init__()
+
+ self.conv = nn.Conv2D(
+ num_channels=in_channels,
+ filter_size=filter_size,
+ num_filters=out_channels,
+ stride=stride,
+ padding=padding,
+ groups=groups,
+ bias_attr=bias_init(),
+ param_attr=weight_init())
+ self.bn = nn.BatchNorm(
+ out_channels,
+ param_attr=norm_weight_init(),
+ bias_attr=norm_bias_init(),
+ act=None,
+ momentum=0.9,
+ use_global_stats=is_test)
+
+ def forward(self, inputs):
+ res = self.conv(inputs)
+ self.conv_res = res
+ res = self.bn(res)
+ res = fluid.layers.relu(res)
+ return res
+
+
+class FCBNReluLayer(fluid.dygraph.Layer):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ in_size,
+ is_bias=True,
+ is_bn=True,
+ is_relu=True,
+ is_test=False):
+ super(FCBNReluLayer, self).__init__()
+ self.is_bn = is_bn
+ self.is_relu = is_relu
+
+ if is_bias:
+ bias_init = fluid.ParamAttr(
+ initializer=fluid.initializer.ConstantInitializer(0.))
+ else:
+ bias_init = False
+ self.linear = nn.Linear(
+ in_channels * in_size * in_size, out_channels, bias_attr=bias_init)
+ self.bn = nn.BatchNorm(
+ out_channels,
+ param_attr=norm_weight_init(),
+ bias_attr=norm_bias_init(),
+ act=None,
+ momentum=0.9,
+ use_global_stats=is_test)
+
+ def forward(self, x):
+ x = fluid.layers.reshape(x, [x.shape[0], -1])
+
+ x = self.linear(x)
+ if self.is_bn:
+ x = self.bn(x)
+ if self.is_relu:
+ x = fluid.layers.relu(x)
+ return x
+
+
+class AtomIouNet(fluid.dygraph.Layer):
+ def __init__(self,
+ name,
+ input_dim=(128, 256),
+ pred_input_dim=(256, 256),
+ pred_inter_dim=(256, 256),
+ is_test=False):
+ super(AtomIouNet, self).__init__(name)
+ self.name = self.full_name()
+ self.conv3_1r = ConvBNReluLayer(
+ input_dim[0], 128, filter_size=3, stride=1, is_test=is_test)
+ self.conv3_1t = ConvBNReluLayer(
+ input_dim[0], 256, filter_size=3, stride=1, is_test=is_test)
+
+ self.conv3_2t = ConvBNReluLayer(
+ 256, pred_input_dim[0], filter_size=3, stride=1, is_test=is_test)
+
+ self.fc3_1r = ConvBNReluLayer(
+ 128, 256, filter_size=3, stride=1, padding=0, is_test=is_test)
+
+ self.conv4_1r = ConvBNReluLayer(
+ input_dim[1], 256, filter_size=3, stride=1, is_test=is_test)
+ self.conv4_1t = ConvBNReluLayer(
+ input_dim[1], 256, filter_size=3, stride=1, is_test=is_test)
+
+ self.conv4_2t = ConvBNReluLayer(
+ 256, pred_input_dim[1], filter_size=3, stride=1, is_test=is_test)
+
+ self.fc34_3r = ConvBNReluLayer(
+ 512,
+ pred_input_dim[0],
+ filter_size=1,
+ stride=1,
+ padding=0,
+ is_test=is_test)
+ self.fc34_4r = ConvBNReluLayer(
+ 512,
+ pred_input_dim[1],
+ filter_size=1,
+ stride=1,
+ padding=0,
+ is_test=is_test)
+
+ self.fc3_rt = FCBNReluLayer(
+ pred_input_dim[0], pred_inter_dim[0], in_size=5, is_test=is_test)
+ self.fc4_rt = FCBNReluLayer(
+ pred_input_dim[1], pred_inter_dim[1], in_size=3, is_test=is_test)
+
+ bias_init = fluid.initializer.ConstantInitializer(0.)
+ self.iou_predictor = nn.Linear(
+ pred_inter_dim[0] + pred_inter_dim[1], 1, bias_attr=bias_init)
+
+ self.outs = {}
+
+ def predict_iou(self, filter, feat2, proposals):
+ """
+ predicts IOU for the given proposals
+ :param modulation: Modulation vectors for the targets. Dims (batch, feature_dim).
+ :param feat: IoU features (from get_iou_feat) for test images. Dims (batch, feature_dim, H, W).
+ :param proposals: Proposal boxes for which the IoU will be predicted (batch, num_proposals, 4).
+ :return:
+ """
+ fc34_3_r, fc34_4_r = filter
+ c3_t, c4_t = feat2
+
+ batch_size = c3_t.shape[0]
+
+ # Modulation
+ c3_t_att = c3_t * fluid.layers.reshape(fc34_3_r, [batch_size, -1, 1, 1])
+ c4_t_att = c4_t * fluid.layers.reshape(fc34_4_r, [batch_size, -1, 1, 1])
+
+ # add batch roi nums
+ num_proposals_per_batch = proposals.shape[1]
+ batch_roi_nums = np.array([num_proposals_per_batch] *
+ batch_size).astype(np.int64)
+ batch_roi_nums = fluid.dygraph.to_variable(batch_roi_nums)
+
+ # input proposals2 is in format xywh, convert it to x0y0x1y1 format
+ proposals_xyxy = fluid.layers.concat(
+ [
+ proposals[:, :, 0:2],
+ proposals[:, :, 0:2] + proposals[:, :, 2:4]
+ ],
+ axis=2)
+
+ roi2 = fluid.layers.reshape(proposals_xyxy, [-1, 4])
+ roi2.stop_gradient = False
+
+ roi3t = fluid.layers.prroi_pool(
+ c3_t_att, roi2, 1 / 8., 5, 5, batch_roi_nums=batch_roi_nums)
+ roi4t = fluid.layers.prroi_pool(
+ c4_t_att, roi2, 1 / 16., 3, 3, batch_roi_nums=batch_roi_nums)
+
+ fc3_rt = self.fc3_rt(roi3t)
+ fc4_rt = self.fc4_rt(roi4t)
+
+ fc34_rt_cat = fluid.layers.concat([fc3_rt, fc4_rt], axis=1)
+
+ iou_pred = self.iou_predictor(fc34_rt_cat)
+ iou_pred = fluid.layers.reshape(iou_pred,
+ [batch_size, num_proposals_per_batch])
+
+ return iou_pred
+
+ def forward(self, feat1, feat2, bb1, proposals2):
+ """Runs the ATOM IoUNet during training operation.
+ This forward pass is mainly used for training. Call the individual functions during tracking instead.
+ args:
+ feat1: Variable, Features from the reference frames (4 or 5 dims).
+ feat2: Variable, Features from the test frames (4 or 5 dims).
+ bb1: Target boxes (x, y, w, h) in image coords in the reference samples. Dims (images, sequences, 4).
+ proposals2: Proposal boxes for which the IoU will be predicted (images, sequences, num_proposals, 4)."""
+ assert len(feat1[0].shape) == 5, 'Expect 5 dimensional feat1'
+ num_test_images = feat2[0].shape[0]
+ batch_size = feat2[0].shape[1]
+
+ # Extract first train sample
+ feat1 = [f[0] for f in feat1]
+ bb1 = bb1[0]
+
+ # Get modulation vector
+ modulation = self.get_filter(feat1, bb1)
+
+ feat2 = [
+ fluid.layers.reshape(f,
+ (batch_size * num_test_images, *f.shape[-3:]))
+ for f in feat2
+ ]
+ iou_feat = self.get_iou_feat(feat2)
+
+ new_modulation = []
+ for i in range(0, len(modulation)):
+ tmp = modulation[i]
+ tmp = fluid.layers.reshape(tmp, [1, batch_size, -1])
+ tmp = fluid.layers.expand(tmp, [num_test_images, 1, 1])
+ tmp = fluid.layers.reshape(tmp, [batch_size * num_test_images, -1])
+ new_modulation.append(tmp)
+
+ proposals2 = fluid.layers.reshape(
+ proposals2, [batch_size * num_test_images, -1, 4])
+
+ pred_iou = self.predict_iou(new_modulation, iou_feat, proposals2)
+ pred_iou = fluid.layers.reshape(pred_iou,
+ [num_test_images, batch_size, -1])
+ return pred_iou
+
+ def get_filter(self, feat1, bb1):
+ """
+ get modulation feature [feature1, feature2] for the targets
+ :param feat1: variable, Backbone features from reference images. shapes (batch, feature_dim, H, W).
+ :param bb1: variable, Target boxes (x,y,w,h) in image coords in the reference samples. shapes (batch, 4).
+ :return:
+ """
+ feat3_r, feat4_r = feat1
+
+ c3_r = self.conv3_1r(feat3_r)
+
+ # Add batch_index to rois
+ batch_size = bb1.shape[0]
+ batch_roi_nums = np.array([1] * batch_size).astype(np.int64)
+ batch_roi_nums = fluid.dygraph.to_variable(batch_roi_nums)
+
+ # input bb is in format xywh, convert it to x0y0x1y1 format
+ roi1 = fluid.layers.concat(
+ [bb1[:, 0:2], bb1[:, 0:2] + bb1[:, 2:4]], axis=1)
+ roi1.stop_gradient = False
+
+ roi3r = fluid.layers.prroi_pool(c3_r, roi1, 1 / 8., 3, 3,
+ batch_roi_nums)
+
+ c4_r = self.conv4_1r(feat4_r)
+ roi4r = fluid.layers.prroi_pool(c4_r, roi1, 1 / 16., 1, 1,
+ batch_roi_nums)
+
+ fc3_r = self.fc3_1r(roi3r)
+
+ # Concatenate
+ fc34_r = fluid.layers.concat([fc3_r, roi4r], axis=1)
+
+ fc34_3_r = self.fc34_3r(fc34_r)
+ fc34_4_r = self.fc34_4r(fc34_r)
+
+ return fc34_3_r, fc34_4_r
+
+ def get_iou_feat(self, feat2):
+ """
+ Get IoU prediction features from a 4 or 5 dimensional backbone input.
+ :param feat2: variable, Backbone features from reference images. [feature1, feature2]
+ :return: features, variable
+ """
+ feat3_t, feat4_t = feat2
+ c3_t = self.conv3_2t(self.conv3_1t(feat3_t))
+ c4_t = self.conv4_2t(self.conv4_1t(feat4_t))
+
+ return c3_t, c4_t
+
+
+def atom_iounet(name,
+ input_dim=(128, 256),
+ pred_input_dim=(256, 256),
+ pred_inter_dim=(256, 256)):
+ return AtomIouNet(
+ name,
+ input_dim=input_dim,
+ pred_input_dim=pred_input_dim,
+ pred_inter_dim=pred_inter_dim)
+
+
+def test_paddle_iounet():
+ a = np.random.uniform(-1, 1, [1, 1, 512, 18, 18]).astype(np.float32)
+ b = np.random.uniform(-1, 1, [1, 1, 1024, 9, 9]).astype(np.float32)
+ bbox = [[3, 4, 10, 11]]
+ proposal_bbox = [[4, 5, 11, 12] * 16]
+ bbox = np.reshape(np.array(bbox), [1, 1, 4]).astype(np.float32)
+ proposal_bbox = np.reshape(np.array(proposal_bbox),
+ [1, 16, 4]).astype(np.float32)
+ with fluid.dygraph.guard():
+ a_pd = fluid.dygraph.to_variable(a)
+ b_pd = fluid.dygraph.to_variable(b)
+ bbox_pd = fluid.dygraph.to_variable(bbox)
+ proposal_bbox_pd = fluid.dygraph.to_variable(proposal_bbox)
+ feat1 = [a_pd, b_pd]
+ feat2 = [a_pd, b_pd]
+
+ model = AtomIouNet('IOUNet', input_dim=(512, 1024))
+ res = model(feat1, feat2, bbox_pd, proposal_bbox_pd)
+ print(res.shape)
+ params = model.state_dict()
+
+ for v in params:
+ print(v, '\t', params[v].shape)
+ print(len(params))
+
+
+if __name__ == '__main__':
+ test_paddle_iounet()
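+
+# With the shapes used in test_paddle_iounet above, res.shape is [1, 1, 16]:
+# one test frame, one sequence, and 16 proposals per frame.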
diff --git a/PaddleCV/tracking/ltr/models/siamese/__init__.py b/PaddleCV/tracking/ltr/models/siamese/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8adbdfc323c7a6031904b5acb269bb059a9fc92c
--- /dev/null
+++ b/PaddleCV/tracking/ltr/models/siamese/__init__.py
@@ -0,0 +1 @@
+from .target_estimator_net import SiamFCEstimator
diff --git a/PaddleCV/tracking/ltr/models/siamese/siam.py b/PaddleCV/tracking/ltr/models/siamese/siam.py
new file mode 100644
index 0000000000000000000000000000000000000000..058015fcc1d50c23b8708af6bb8381b0d8d642a8
--- /dev/null
+++ b/PaddleCV/tracking/ltr/models/siamese/siam.py
@@ -0,0 +1,64 @@
+from paddle import fluid
+from paddle.fluid import dygraph
+import ltr.models.siamese.target_estimator_net as tgt_estimator
+
+
+class SiamNet(dygraph.layers.Layer):
+ def __init__(self,
+ name,
+ feature_extractor,
+ target_estimator,
+ target_estimator_layer,
+ extractor_grad=True):
+ """
+
+ :param feature_extractor: backbone
+ :param target_estimator: headers
+ :param target_estimator_layer: list, which layer is used in header,
+ :param extractor_grad: default is True
+ """
+ super(SiamNet, self).__init__(name)
+
+ self.feature_extractor = feature_extractor
+ self.target_estimator = target_estimator
+ self.target_estimator_layer = target_estimator_layer
+
+ def forward(self, train_imgs, test_imgs):
+ # extract backbone features
+ if len(train_imgs.shape) == 5:
+ train_imgs = fluid.layers.reshape(
+ train_imgs, [-1, *list(train_imgs.shape)[-3:]])
+ test_imgs = fluid.layers.reshape(test_imgs,
+ [-1, *list(test_imgs.shape)[-3:]])
+
+ train_feat = self.extract_backbone_features(train_imgs)
+ test_feat = self.extract_backbone_features(test_imgs)
+
+ train_feat = [feat for feat in train_feat.values()]
+ test_feat = [feat for feat in test_feat.values()]
+
+ # Obtain target estimation
+ targets = self.target_estimator(train_feat, test_feat)
+ return targets
+
+ def extract_backbone_features(self, im, layers=None):
+ if layers is None:
+ layers = self.target_estimator_layer
+ return self.feature_extractor(im, layers)
+
+ def extract_features(self, im, layers):
+ return self.feature_extractor(im, layers)
+
+
+def siamfc_alexnet(backbone_pretrained=False,
+ backbone_is_test=False,
+ estimator_is_test=False):
+ from ltr.models.backbone.sfc_alexnet import SFC_AlexNet
+ backbone_net = SFC_AlexNet('AlexNet', is_test=backbone_is_test)
+ target_estimator = tgt_estimator.SiamFCEstimator('CenterEstimator')
+ model = SiamNet(
+ 'SiamFC',
+ backbone_net,
+ target_estimator,
+ ['conv5'], )
+ return model
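+
+# Forward-pass sketch (exemplar/search sizes follow the common SiamFC setup and are
+# assumptions here, not requirements of this module):
+# import numpy as np
+# with fluid.dygraph.guard():
+#     net = siamfc_alexnet(backbone_is_test=True, estimator_is_test=True)
+#     z = fluid.dygraph.to_variable(np.zeros([1, 3, 127, 127], dtype='float32'))
+#     x = fluid.dygraph.to_variable(np.zeros([1, 3, 255, 255], dtype='float32'))
+#     score_map = net(z, x)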
diff --git a/PaddleCV/tracking/ltr/models/siamese/target_estimator_net.py b/PaddleCV/tracking/ltr/models/siamese/target_estimator_net.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4a676dd4594e4c17fdbf2b16eb92cb40ce451af
--- /dev/null
+++ b/PaddleCV/tracking/ltr/models/siamese/target_estimator_net.py
@@ -0,0 +1,47 @@
+from paddle import fluid
+from paddle.fluid import dygraph
+from paddle.fluid.dygraph import nn
+
+from pytracking.libs.Fconv2d import Conv2D
+
+
+class SiamFCEstimator(dygraph.layers.Layer):
+ def __init__(self, name):
+ super().__init__(name)
+ init_w = fluid.ParamAttr(
+ name="a_weight",
+ initializer=fluid.initializer.ConstantInitializer(0.001),
+ learning_rate=0.,
+ trainable=False)
+ init_b = fluid.ParamAttr(
+ name="a_bias",
+ initializer=fluid.initializer.ConstantInitializer(0.),
+ trainable=True)
+
+ self.adjust_conv = nn.Conv2D(
+ 1, 1, 1, 1, 0, param_attr=init_w, bias_attr=init_b)
+
+ def forward(self, exemplar, instance):
+ exemplar_f = self.get_reference(exemplar)
+ instance_f = self.get_search_feat(instance)
+ score_map = self.estimate(exemplar_f, instance_f)
+ return score_map
+
+ def get_reference(self, feat):
+ # remove list warp
+ return feat[0]
+
+ def get_search_feat(self, feat):
+ # remove list warp
+ return feat[0]
+
+ def estimate(self, exemplar, instance):
+ shape = instance.shape
+ instance = fluid.layers.reshape(
+ instance, shape=[1, -1, shape[2], shape[3]])
+
+ cross_conv = Conv2D(stride=1, padding=0, dilation=1, groups=shape[0])
+ score_map = cross_conv(instance, exemplar)
+ score_map = fluid.layers.transpose(score_map, [1, 0, 2, 3])
+ score_map = self.adjust_conv(score_map)
+ return score_map
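+
+# Note on estimate(): folding the search batch into the channel dimension and setting
+# groups to the batch size makes the single Conv2D call act as a per-sample
+# cross-correlation between each search feature map and the shared exemplar kernel.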
diff --git a/PaddleCV/tracking/ltr/run_training.py b/PaddleCV/tracking/ltr/run_training.py
new file mode 100644
index 0000000000000000000000000000000000000000..e18fc03ff2738020036f642178948aba87dade0d
--- /dev/null
+++ b/PaddleCV/tracking/ltr/run_training.py
@@ -0,0 +1,60 @@
+import os
+import sys
+import argparse
+import importlib
+import multiprocessing
+import paddle
+import cv2 as cv
+
+env_path = os.path.join(os.path.dirname(__file__), '..')
+if env_path not in sys.path:
+ sys.path.append(env_path)
+
+import ltr.admin.settings as ws_settings
+
+
+def run_training(train_module, train_name):
+ """Run a train scripts in train_settings.
+ args:
+ train_module: Name of module in the "train_settings/" folder.
+ train_name: Name of the train settings file.
+ """
+ # restrict OpenCV to a single thread
+ cv.setNumThreads(0)
+
+ print('Training: {} {}'.format(train_module, train_name))
+
+ settings = ws_settings.Settings()
+
+ if settings.env.workspace_dir == '':
+ raise Exception('Set up your workspace_dir in "ltr/admin/local.py".')
+
+ settings.module_name = train_module
+ settings.script_name = train_name
+ settings.project_path = 'ltr/{}/{}'.format(train_module, train_name)
+
+ expr_module = importlib.import_module('ltr.train_settings.{}.{}'.format(
+ train_module, train_name))
+ expr_func = getattr(expr_module, 'run')
+
+ expr_func(settings)
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description='Run a training script from train_settings.')
+ parser.add_argument(
+ 'train_module',
+ type=str,
+ help='Name of module in the "train_settings/" folder.')
+ parser.add_argument(
+ 'train_name', type=str, help='Name of the train settings file.')
+
+ args = parser.parse_args()
+
+ run_training(args.train_module, args.train_name)
+
+
+if __name__ == '__main__':
+ multiprocessing.set_start_method('spawn', force=True)
+ main()
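+
+# Example invocation (the module/settings names must exist under ltr/train_settings/):
+# python ltr/run_training.py bbreg atom_res18_vid_lasot_coco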
diff --git a/PaddleCV/tracking/ltr/train_settings/bbreg/atom_res18_vid_lasot_coco.py b/PaddleCV/tracking/ltr/train_settings/bbreg/atom_res18_vid_lasot_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..688d9c81ad34a947758deea3e0e173344704ef67
--- /dev/null
+++ b/PaddleCV/tracking/ltr/train_settings/bbreg/atom_res18_vid_lasot_coco.py
@@ -0,0 +1,148 @@
+import paddle.fluid as fluid
+import paddle.fluid.dygraph as dygraph
+
+import ltr.actors as actors
+import ltr.data.transforms as dltransforms
+from ltr.data import processing, sampler, loader
+from ltr.dataset import ImagenetVID, MSCOCOSeq, Lasot, Got10k
+from ltr.models.bbreg.atom import atom_resnet50, atom_resnet18
+from ltr.trainers import LTRTrainer
+
+
+def run(settings):
+ # Most common settings are assigned in the settings struct
+ settings.description = 'ATOM IoUNet with ResNet18 backbone and trained with vid, lasot, coco.'
+ settings.print_interval = 1 # How often to print loss and other info
+ settings.batch_size = 64 # Batch size
+ settings.num_workers = 4 # Number of workers for image loading
+ settings.normalize_mean = [0.485, 0.456, 0.406
+ ] # Normalize mean (default ImageNet values)
+ settings.normalize_std = [0.229, 0.224,
+ 0.225] # Normalize std (default ImageNet values)
+ settings.search_area_factor = 5.0 # Image patch size relative to target size
+ settings.feature_sz = 18 # Size of feature map
+ settings.output_sz = settings.feature_sz * 16 # Size of input image patches
+
+ # Settings for the image sample and proposal generation
+ settings.center_jitter_factor = {'train': 0, 'test': 4.5}
+ settings.scale_jitter_factor = {'train': 0, 'test': 0.5}
+ settings.proposal_params = {
+ 'min_iou': 0.1,
+ 'boxes_per_frame': 16,
+ 'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3]
+ }
+
+ # Train datasets
+ vid_train = ImagenetVID()
+ lasot_train = Lasot(split='train')
+ coco_train = MSCOCOSeq()
+
+ # Validation datasets
+ got10k_val = Got10k(split='val')
+
+ # The joint augmentation transform, that is applied to the pairs jointly
+ transform_joint = dltransforms.ToGrayscale(probability=0.05)
+
+ # The augmentation transform applied to the training set (individually to each image in the pair)
+ transform_train = dltransforms.Compose([
+ dltransforms.ToArrayAndJitter(0.2), dltransforms.Normalize(
+ mean=settings.normalize_mean, std=settings.normalize_std)
+ ])
+
+ # The augmentation transform applied to the validation set (individually to each image in the pair)
+ transform_val = dltransforms.Compose([
+ dltransforms.ToArray(), dltransforms.Normalize(
+ mean=settings.normalize_mean, std=settings.normalize_std)
+ ])
+
+ # Data processing to do on the training pairs
+ data_processing_train = processing.ATOMProcessing(
+ search_area_factor=settings.search_area_factor,
+ output_sz=settings.output_sz,
+ center_jitter_factor=settings.center_jitter_factor,
+ scale_jitter_factor=settings.scale_jitter_factor,
+ mode='sequence',
+ proposal_params=settings.proposal_params,
+ transform=transform_train,
+ joint_transform=transform_joint)
+
+ # Data processing to do on the validation pairs
+ data_processing_val = processing.ATOMProcessing(
+ search_area_factor=settings.search_area_factor,
+ output_sz=settings.output_sz,
+ center_jitter_factor=settings.center_jitter_factor,
+ scale_jitter_factor=settings.scale_jitter_factor,
+ mode='sequence',
+ proposal_params=settings.proposal_params,
+ transform=transform_val,
+ joint_transform=transform_joint)
+
+ # The sampler for training
+ dataset_train = sampler.ATOMSampler(
+ [vid_train, lasot_train, coco_train], [1, 1, 1],
+ samples_per_epoch=1000 * settings.batch_size,
+ max_gap=50,
+ processing=data_processing_train)
+
+ # The loader for training
+ train_loader = loader.LTRLoader(
+ 'train',
+ dataset_train,
+ training=True,
+ batch_size=settings.batch_size,
+ num_workers=4,
+ stack_dim=1)
+
+ # The sampler for validation
+ dataset_val = sampler.ATOMSampler(
+ [got10k_val], [1, ],
+ samples_per_epoch=500 * settings.batch_size,
+ max_gap=50,
+ processing=data_processing_val)
+
+ # The loader for validation
+ val_loader = loader.LTRLoader(
+ 'val',
+ dataset_val,
+ training=False,
+ batch_size=settings.batch_size,
+ epoch_interval=5,
+ num_workers=4,
+ stack_dim=1)
+
+    # Create network, set objective, create optimizer, learning rate scheduler and trainer
+ with dygraph.guard():
+ # Create network
+ net = atom_resnet18(backbone_pretrained=True)
+
+ # Freeze backbone
+ state_dicts = net.state_dict()
+ for k in state_dicts.keys():
+ if 'feature_extractor' in k and "running" not in k:
+ state_dicts[k].stop_gradient = True
+
+ # Set objective
+ objective = fluid.layers.square_error_cost
+
+ # Create actor, which wraps network and objective
+ actor = actors.AtomActor(net=net, objective=objective)
+
+ # Set to training mode
+ actor.train()
+
+        # Define optimizer and learning rate schedule
+        gamma = 0.2
+        lr = 1e-3
+        lr_scheduler = fluid.dygraph.PiecewiseDecay(
+            [15, 30, 45],
+            values=[lr, lr * gamma, lr * gamma * gamma],
+ step=1000,
+ begin=0)
+
+ optimizer = fluid.optimizer.Adam(
+ parameter_list=net.bb_regressor.parameters(),
+ learning_rate=lr_scheduler)
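+        # Note: only the IoU predictor parameters (net.bb_regressor.parameters()) are
+        # handed to Adam, so the frozen backbone is never updated by the optimizer.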
+
+ trainer = LTRTrainer(actor, [train_loader, val_loader], optimizer,
+ settings, lr_scheduler)
+ trainer.train(40, load_latest=False, fail_safe=False)
diff --git a/PaddleCV/tracking/ltr/train_settings/bbreg/atom_res50_vid_lasot_coco.py b/PaddleCV/tracking/ltr/train_settings/bbreg/atom_res50_vid_lasot_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b4f09bcf361ca539e4f0f3a2754300918df8c33
--- /dev/null
+++ b/PaddleCV/tracking/ltr/train_settings/bbreg/atom_res50_vid_lasot_coco.py
@@ -0,0 +1,148 @@
+import paddle.fluid as fluid
+import paddle.fluid.dygraph as dygraph
+
+import ltr.actors as actors
+import ltr.data.transforms as dltransforms
+from ltr.data import processing, sampler, loader
+from ltr.dataset import ImagenetVID, MSCOCOSeq, Lasot, Got10k
+from ltr.models.bbreg.atom import atom_resnet50, atom_resnet18
+from ltr.trainers import LTRTrainer
+
+
+def run(settings):
+ # Most common settings are assigned in the settings struct
+ settings.description = 'ATOM IoUNet with ResNet50 backbone and trained with vid, lasot, coco.'
+ settings.print_interval = 1 # How often to print loss and other info
+ settings.batch_size = 64 # Batch size
+ settings.num_workers = 4 # Number of workers for image loading
+    settings.normalize_mean = [0.485, 0.456, 0.406]  # Normalize mean (default ImageNet values)
+    settings.normalize_std = [0.229, 0.224, 0.225]  # Normalize std (default ImageNet values)
+ settings.search_area_factor = 5.0 # Image patch size relative to target size
+ settings.feature_sz = 18 # Size of feature map
+ settings.output_sz = settings.feature_sz * 16 # Size of input image patches
+
+ # Settings for the image sample and proposal generation
+ settings.center_jitter_factor = {'train': 0, 'test': 4.5}
+ settings.scale_jitter_factor = {'train': 0, 'test': 0.5}
+ settings.proposal_params = {
+ 'min_iou': 0.1,
+ 'boxes_per_frame': 16,
+ 'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3]
+ }
+
+ # Train datasets
+ vid_train = ImagenetVID()
+ lasot_train = Lasot(split='train')
+ coco_train = MSCOCOSeq()
+
+ # Validation datasets
+ got10k_val = Got10k(split='val')
+
+ # The joint augmentation transform, that is applied to the pairs jointly
+ transform_joint = dltransforms.ToGrayscale(probability=0.05)
+
+ # The augmentation transform applied to the training set (individually to each image in the pair)
+ transform_train = dltransforms.Compose([
+ dltransforms.ToArrayAndJitter(0.2), dltransforms.Normalize(
+ mean=settings.normalize_mean, std=settings.normalize_std)
+ ])
+
+ # The augmentation transform applied to the validation set (individually to each image in the pair)
+ transform_val = dltransforms.Compose([
+ dltransforms.ToArray(), dltransforms.Normalize(
+ mean=settings.normalize_mean, std=settings.normalize_std)
+ ])
+
+ # Data processing to do on the training pairs
+ data_processing_train = processing.ATOMProcessing(
+ search_area_factor=settings.search_area_factor,
+ output_sz=settings.output_sz,
+ center_jitter_factor=settings.center_jitter_factor,
+ scale_jitter_factor=settings.scale_jitter_factor,
+ mode='sequence',
+ proposal_params=settings.proposal_params,
+ transform=transform_train,
+ joint_transform=transform_joint)
+
+ # Data processing to do on the validation pairs
+ data_processing_val = processing.ATOMProcessing(
+ search_area_factor=settings.search_area_factor,
+ output_sz=settings.output_sz,
+ center_jitter_factor=settings.center_jitter_factor,
+ scale_jitter_factor=settings.scale_jitter_factor,
+ mode='sequence',
+ proposal_params=settings.proposal_params,
+ transform=transform_val,
+ joint_transform=transform_joint)
+
+ # The sampler for training
+ dataset_train = sampler.ATOMSampler(
+ [vid_train, lasot_train, coco_train], [1, 1, 1],
+ samples_per_epoch=1000 * settings.batch_size,
+ max_gap=50,
+ processing=data_processing_train)
+
+ # The loader for training
+ train_loader = loader.LTRLoader(
+ 'train',
+ dataset_train,
+ training=True,
+ batch_size=settings.batch_size,
+ num_workers=4,
+ stack_dim=1)
+
+ # The sampler for validation
+ dataset_val = sampler.ATOMSampler(
+ [got10k_val], [1, ],
+ samples_per_epoch=500 * settings.batch_size,
+ max_gap=50,
+ processing=data_processing_val)
+
+ # The loader for validation
+ val_loader = loader.LTRLoader(
+ 'val',
+ dataset_val,
+ training=False,
+ batch_size=settings.batch_size,
+ num_workers=4,
+ epoch_interval=5,
+ stack_dim=1)
+
+    # Create network, set objective, create optimizer, learning rate scheduler and trainer
+ with dygraph.guard():
+ # Create network
+ net = atom_resnet50(backbone_pretrained=True)
+
+ # Freeze backbone
+ state_dicts = net.state_dict()
+ for k in state_dicts.keys():
+ if 'feature_extractor' in k and "running" not in k:
+ state_dicts[k].stop_gradient = True
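+        # The stop_gradient flags above freeze the pretrained backbone weights; keys
+        # containing "running" (BatchNorm running statistics) are skipped because they
+        # are buffers that are not updated by gradients in the first place.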
+
+ # Set objective
+ objective = fluid.layers.square_error_cost
+
+ # Create actor, which wraps network and objective
+ actor = actors.AtomActor(net=net, objective=objective)
+
+ # Set to training mode
+ actor.train()
+
+        # Define optimizer and learning rate schedule
+        gamma = 0.2
+        lr = 1e-3
+        lr_scheduler = fluid.dygraph.PiecewiseDecay(
+            [15, 30, 45],
+            values=[lr, lr * gamma, lr * gamma * gamma],
+ step=1000,
+ begin=0)
+
+ optimizer = fluid.optimizer.Adam(
+ parameter_list=net.bb_regressor.parameters(),
+ learning_rate=lr_scheduler)
+
+ trainer = LTRTrainer(actor, [train_loader, val_loader], optimizer,
+ settings, lr_scheduler)
+ trainer.train(40, load_latest=False, fail_safe=False)
diff --git a/PaddleCV/tracking/ltr/train_settings/siamfc/siamfc_alexnet_vid.py b/PaddleCV/tracking/ltr/train_settings/siamfc/siamfc_alexnet_vid.py
new file mode 100644
index 0000000000000000000000000000000000000000..03c5826ec7a351fb7097964e45afd63476fc51d2
--- /dev/null
+++ b/PaddleCV/tracking/ltr/train_settings/siamfc/siamfc_alexnet_vid.py
@@ -0,0 +1,181 @@
+import paddle.fluid as fluid
+import paddle.fluid.dygraph as dygraph
+
+import ltr.actors as actors
+import ltr.data.transforms as dltransforms
+from ltr.data import processing, sampler, loader
+from ltr.dataset import ImagenetVID, Got10k
+from ltr.models.siamese.siam import siamfc_alexnet
+from ltr.trainers import LTRTrainer
+import numpy as np
+import cv2 as cv
+from PIL import Image, ImageEnhance
+
+
+class DataAug(dltransforms.Transform):
+ def __init__(self):
+ pass
+
+ def random_blur(self, img):
+ k = np.random.choice([3, 5, 7])
+ return cv.GaussianBlur(img, (k, k), sigmaX=0, sigmaY=0)
+
+ def brightness(self, img):
+ img = Image.fromarray(img.astype('uint8'))
+ enh_bri = ImageEnhance.Brightness(img)
+ brightness = np.random.choice(np.linspace(0.5, 1.25, 4))
+ img_brighted = enh_bri.enhance(brightness)
+
+ return np.array(img_brighted)
+
+ def contrast(self, img):
+ img = Image.fromarray(img.astype('uint8'))
+ enh_con = ImageEnhance.Contrast(img)
+ contrast = np.random.choice(np.linspace(0.5, 1.25, 4))
+ image_contrasted = enh_con.enhance(contrast)
+
+ return np.array(image_contrasted)
+
+ def no_aug(self, img):
+ return img
+
+ def flip(self, img):
+ return cv.flip(img, 1)
+
+ def transform(self, img, *args):
+ func = np.random.choice(
+ [self.contrast, self.random_blur, self.brightness, self.flip])
+ return func(img)
+
+
+def run(settings):
+ # Most common settings are assigned in the settings struct
+ settings.description = 'SiamFC with Alexnet backbone and trained with vid'
+ settings.print_interval = 100 # How often to print loss and other info
+ settings.batch_size = 8 # Batch size
+ settings.num_workers = 8 # Number of workers for image loading
+ settings.normalize_mean = [0., 0., 0.] # Normalize mean
+ settings.normalize_std = [1 / 255., 1 / 255., 1 / 255.] # Normalize std
+ settings.search_area_factor = {
+ 'train': 1.0,
+ 'test': 2.0078740157480315
+ } # roughly the same as SiamFC
+ settings.output_sz = {'train': 127, 'test': 255}
+ settings.scale_type = 'context'
+ settings.border_type = 'meanpad'
+
+ # Settings for the image sample and proposal generation
+ settings.center_jitter_factor = {'train': 0, 'test': 0}
+ settings.scale_jitter_factor = {'train': 0, 'test': 0.}
+
+ # Train datasets
+ vid_train = ImagenetVID()
+
+ # Validation datasets
+ got10k_val = vid_train #Got10k(split='val')
+
+ # The joint augmentation transform, that is applied to the pairs jointly
+ transform_joint = dltransforms.ToGrayscale(probability=0.25)
+
+ # The augmentation transform applied to the training set (individually to each image in the pair)
+ transform_exemplar = dltransforms.Compose([
+ dltransforms.ToArray(), dltransforms.Normalize(
+ mean=settings.normalize_mean, std=settings.normalize_std)
+ ])
+ transform_instance = dltransforms.Compose([
+ DataAug(), dltransforms.ToArray(), dltransforms.Normalize(
+ mean=settings.normalize_mean, std=settings.normalize_std)
+ ])
+
+ # Data processing to do on the training pairs
+ data_processing_train = processing.SiamFCProcessing(
+ search_area_factor=settings.search_area_factor,
+ output_sz=settings.output_sz,
+ center_jitter_factor=settings.center_jitter_factor,
+ scale_jitter_factor=settings.scale_jitter_factor,
+ scale_type=settings.scale_type,
+ border_type=settings.border_type,
+ mode='sequence',
+ train_transform=transform_exemplar,
+ test_transform=transform_instance,
+ joint_transform=transform_joint)
+
+ # Data processing to do on the validation pairs
+ data_processing_val = processing.SiamFCProcessing(
+ search_area_factor=settings.search_area_factor,
+ output_sz=settings.output_sz,
+ center_jitter_factor=settings.center_jitter_factor,
+ scale_jitter_factor=settings.scale_jitter_factor,
+ scale_type=settings.scale_type,
+ border_type=settings.border_type,
+ mode='sequence',
+ transform=transform_exemplar,
+ joint_transform=transform_joint)
+
+ # The sampler for training
+ dataset_train = sampler.ATOMSampler(
+ [vid_train], [1, ],
+ samples_per_epoch=6650 * settings.batch_size,
+ max_gap=100,
+ processing=data_processing_train)
+
+ # The loader for training
+ train_loader = loader.LTRLoader(
+ 'train',
+ dataset_train,
+ training=True,
+ batch_size=settings.batch_size,
+ num_workers=settings.num_workers,
+ stack_dim=1)
+
+ # The sampler for validation
+ dataset_val = sampler.ATOMSampler(
+ [got10k_val], [1, ],
+ samples_per_epoch=1000 * settings.batch_size,
+ max_gap=100,
+ processing=data_processing_val)
+
+ # The loader for validation
+ val_loader = loader.LTRLoader(
+ 'val',
+ dataset_val,
+ training=False,
+ batch_size=settings.batch_size,
+ num_workers=settings.num_workers,
+ epoch_interval=5,
+ stack_dim=1)
+
+    # Create network, set objective, create optimizer, learning rate scheduler and trainer
+ with dygraph.guard():
+ # Create network
+ net = siamfc_alexnet()
+
+ # Create actor, which wraps network and objective
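+        # The shape/radius/stride arguments below are assumed to describe the SiamFC
+        # training labels: a 17x17 response map at network stride 8, where locations
+        # within `radius` pixels of the target center are treated as positives.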
+ actor = actors.SiamFCActor(
+ net=net,
+ objective=None,
+ batch_size=settings.batch_size,
+ shape=(17, 17),
+ radius=16,
+ stride=8)
+
+ # Set to training mode
+ actor.train()
+
+ # define optimizer and learning rate
+ lr_scheduler = fluid.layers.exponential_decay(
+ learning_rate=0.01,
+ decay_steps=6650,
+ decay_rate=0.8685,
+ staircase=True)
+ regularizer = fluid.regularizer.L2DecayRegularizer(
+ regularization_coeff=0.0005)
+ optimizer = fluid.optimizer.Momentum(
+ momentum=0.9,
+ regularization=regularizer,
+ parameter_list=net.parameters(),
+ learning_rate=lr_scheduler)
+
+ trainer = LTRTrainer(actor, [train_loader], optimizer, settings,
+ lr_scheduler)
+ trainer.train(50, load_latest=False, fail_safe=False)
diff --git a/PaddleCV/tracking/ltr/train_settings/siamfc/siamfc_alexnet_vid_replicate.py b/PaddleCV/tracking/ltr/train_settings/siamfc/siamfc_alexnet_vid_replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ca4695a3232b55fcdb226bd34358903bbcabc32
--- /dev/null
+++ b/PaddleCV/tracking/ltr/train_settings/siamfc/siamfc_alexnet_vid_replicate.py
@@ -0,0 +1,181 @@
+import paddle.fluid as fluid
+import paddle.fluid.dygraph as dygraph
+
+import ltr.actors as actors
+import ltr.data.transforms as dltransforms
+from ltr.data import processing, sampler, loader
+from ltr.dataset import ImagenetVID, Got10k
+from ltr.models.siamese.siam import siamfc_alexnet
+from ltr.trainers import LTRTrainer
+import numpy as np
+import cv2 as cv
+from PIL import Image, ImageEnhance
+
+
+class DataAug(dltransforms.Transform):
+ def __init__(self):
+ pass
+
+ def random_blur(self, img):
+ k = np.random.choice([3, 5, 7])
+ return cv.GaussianBlur(img, (k, k), sigmaX=0, sigmaY=0)
+
+ def brightness(self, img):
+ img = Image.fromarray(img.astype('uint8'))
+ enh_bri = ImageEnhance.Brightness(img)
+ brightness = np.random.choice(np.linspace(0.5, 1.25, 4))
+ img_brighted = enh_bri.enhance(brightness)
+
+ return np.array(img_brighted)
+
+ def contrast(self, img):
+ img = Image.fromarray(img.astype('uint8'))
+ enh_con = ImageEnhance.Contrast(img)
+ contrast = np.random.choice(np.linspace(0.5, 1.25, 4))
+ image_contrasted = enh_con.enhance(contrast)
+
+ return np.array(image_contrasted)
+
+ def no_aug(self, img):
+ return img
+
+ def flip(self, img):
+ return cv.flip(img, 1)
+
+ def transform(self, img, *args):
+ func = np.random.choice(
+ [self.contrast, self.random_blur, self.brightness, self.flip])
+ return func(img)
+
+
+def run(settings):
+ # Most common settings are assigned in the settings struct
+ settings.description = 'SiamFC with Alexnet backbone and trained with vid'
+ settings.print_interval = 1 # How often to print loss and other info
+ settings.batch_size = 8 # Batch size
+ settings.num_workers = 8 # Number of workers for image loading
+ settings.normalize_mean = [0., 0., 0.] # Normalize mean
+ settings.normalize_std = [1 / 255., 1 / 255., 1 / 255.] # Normalize std
+ settings.search_area_factor = {
+ 'train': 1.0,
+ 'test': 2.0078740157480315
+ } # roughly the same as SiamFC
+ settings.output_sz = {'train': 127, 'test': 255}
+ settings.scale_type = 'context'
+ settings.border_type = 'replicate'
+
+ # Settings for the image sample and proposal generation
+ settings.center_jitter_factor = {'train': 0, 'test': 0}
+ settings.scale_jitter_factor = {'train': 0, 'test': 0.}
+
+ # Train datasets
+ vid_train = ImagenetVID()
+
+ # Validation datasets
+ got10k_val = Got10k(split='val')
+
+ # The joint augmentation transform, that is applied to the pairs jointly
+ transform_joint = dltransforms.ToGrayscale(probability=0.25)
+
+ # The augmentation transform applied to the training set (individually to each image in the pair)
+ transform_exemplar = dltransforms.Compose([
+ dltransforms.ToArray(), dltransforms.Normalize(
+ mean=settings.normalize_mean, std=settings.normalize_std)
+ ])
+ transform_instance = dltransforms.Compose([
+ DataAug(), dltransforms.ToArray(), dltransforms.Normalize(
+ mean=settings.normalize_mean, std=settings.normalize_std)
+ ])
+
+ # Data processing to do on the training pairs
+ data_processing_train = processing.SiamFCProcessing(
+ search_area_factor=settings.search_area_factor,
+ output_sz=settings.output_sz,
+ center_jitter_factor=settings.center_jitter_factor,
+ scale_jitter_factor=settings.scale_jitter_factor,
+ scale_type=settings.scale_type,
+ border_type=settings.border_type,
+ mode='sequence',
+ train_transform=transform_exemplar,
+ test_transform=transform_instance,
+ joint_transform=transform_joint)
+
+ # Data processing to do on the validation pairs
+ data_processing_val = processing.SiamFCProcessing(
+ search_area_factor=settings.search_area_factor,
+ output_sz=settings.output_sz,
+ center_jitter_factor=settings.center_jitter_factor,
+ scale_jitter_factor=settings.scale_jitter_factor,
+ scale_type=settings.scale_type,
+ border_type=settings.border_type,
+ mode='sequence',
+ transform=transform_exemplar,
+ joint_transform=transform_joint)
+
+ # The sampler for training
+ dataset_train = sampler.ATOMSampler(
+ [vid_train], [1, ],
+ samples_per_epoch=6650 * settings.batch_size,
+ max_gap=100,
+ processing=data_processing_train)
+
+ # The loader for training
+ train_loader = loader.LTRLoader(
+ 'train',
+ dataset_train,
+ training=True,
+ batch_size=settings.batch_size,
+ num_workers=settings.num_workers,
+ stack_dim=1)
+
+ # The sampler for validation
+ dataset_val = sampler.ATOMSampler(
+ [got10k_val], [1, ],
+ samples_per_epoch=1000 * settings.batch_size,
+ max_gap=100,
+ processing=data_processing_val)
+
+ # The loader for validation
+ val_loader = loader.LTRLoader(
+ 'val',
+ dataset_val,
+ training=False,
+ batch_size=settings.batch_size,
+ num_workers=settings.num_workers,
+ epoch_interval=5,
+ stack_dim=1)
+
+    # Create network, set objective, create optimizer, learning rate scheduler and trainer
+ with dygraph.guard():
+ # Create network
+ net = siamfc_alexnet()
+
+ # Create actor, which wraps network and objective
+ actor = actors.SiamFCActor(
+ net=net,
+ objective=None,
+ batch_size=settings.batch_size,
+ shape=(17, 17),
+ radius=16,
+ stride=8)
+
+ # Set to training mode
+ actor.train()
+
+ # define optimizer and learning rate
+ lr_scheduler = fluid.layers.exponential_decay(
+ learning_rate=0.01,
+ decay_steps=6650,
+ decay_rate=0.8685,
+ staircase=True)
+ regularizer = fluid.regularizer.L2DecayRegularizer(
+ regularization_coeff=0.0005)
+ optimizer = fluid.optimizer.Momentum(
+ momentum=0.9,
+ regularization=regularizer,
+ parameter_list=net.parameters(),
+ learning_rate=lr_scheduler)
+
+ trainer = LTRTrainer(actor, [train_loader, val_loader], optimizer,
+ settings, lr_scheduler)
+ trainer.train(50, load_latest=False, fail_safe=False)
diff --git a/PaddleCV/tracking/ltr/trainers/__init__.py b/PaddleCV/tracking/ltr/trainers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..33ff4417a25665f5d92481aef449f8d7b2386e13
--- /dev/null
+++ b/PaddleCV/tracking/ltr/trainers/__init__.py
@@ -0,0 +1,2 @@
+from .base_trainer import BaseTrainer
+from .ltr_trainer import LTRTrainer
diff --git a/PaddleCV/tracking/ltr/trainers/base_trainer.py b/PaddleCV/tracking/ltr/trainers/base_trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..99a206d96a9ef4b0a819150f8c9e589e542deeaf
--- /dev/null
+++ b/PaddleCV/tracking/ltr/trainers/base_trainer.py
@@ -0,0 +1,156 @@
+import os
+import glob
+from paddle import fluid
+from paddle.fluid import dygraph
+import pickle
+
+
+class BaseTrainer:
+    """Base trainer class. Contains functions for training and saving/loading checkpoints.
+ Trainer classes should inherit from this one and overload the train_epoch function."""
+
+ def __init__(self, actor, loaders, optimizer, settings, lr_scheduler=None):
+ """
+ args:
+ actor - The actor for training the network
+ loaders - list of dataset loaders, e.g. [train_loader, val_loader]. In each epoch, the trainer runs one
+ epoch for each loader.
+ optimizer - The optimizer used for training, e.g. Adam
+ settings - Training settings
+ lr_scheduler - Learning rate scheduler
+ """
+ self.actor = actor
+ self.optimizer = optimizer
+ self.lr_scheduler = lr_scheduler
+ self.loaders = loaders
+
+ self.update_settings(settings)
+
+ self.epoch = 0
+ self.stats = {}
+
+ def update_settings(self, settings=None):
+ """Updates the trainer settings. Must be called to update internal settings."""
+ if settings is not None:
+ self.settings = settings
+
+ if self.settings.env.workspace_dir is not None:
+ self.settings.env.workspace_dir = os.path.expanduser(
+ self.settings.env.workspace_dir)
+ self._checkpoint_dir = os.path.join(self.settings.env.workspace_dir,
+ 'checkpoints')
+ if not os.path.exists(self._checkpoint_dir):
+ os.makedirs(self._checkpoint_dir)
+ else:
+ self._checkpoint_dir = None
+
+ def train(self, max_epochs, load_latest=False, fail_safe=True):
+ """Do training for the given number of epochs.
+ args:
+            max_epochs - Max number of training epochs.
+            load_latest - Bool indicating whether to resume from the latest saved epoch.
+            fail_safe - Bool indicating whether training should automatically restart in case of a crash.
+ """
+
+ num_tries = 10
+ for i in range(num_tries):
+ try:
+ if load_latest:
+ self.load_checkpoint()
+
+ for epoch in range(self.epoch + 1, max_epochs + 1):
+ self.epoch = epoch
+ self.train_epoch()
+
+ if self._checkpoint_dir:
+ self.save_checkpoint()
+            except Exception:
+ print('Training crashed at epoch {}'.format(self.epoch))
+ if fail_safe:
+ load_latest = True
+ print('Restarting training from last epoch ...')
+ else:
+ raise
+
+ print('Finished training!')
+
+ def train_epoch(self):
+ raise NotImplementedError
+
+ def save_checkpoint(self):
+ """Saves a checkpoint of the network and other variables."""
+ actor_type = type(self.actor).__name__
+ net_type = type(self.actor.net).__name__
+ state = {
+ 'epoch': self.epoch,
+ 'actor_type': actor_type,
+ 'net_type': net_type,
+ 'net_info': getattr(self.actor.net, 'info', None),
+ 'constructor': getattr(self.actor.net, 'constructor', None),
+ 'stats': self.stats,
+ 'settings': self.settings
+ }
+
+ directory = '{}/{}/{}_ep{:04d}'.format(self._checkpoint_dir,
+ self.settings.project_path,
+ net_type, self.epoch)
+ if not os.path.exists(directory):
+ os.makedirs(directory)
+
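+        # fluid.save_dygraph appends '.pdparams' / '.pdopt' to the given prefix, while
+        # the custom training state below is pickled inside the checkpoint directory.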
+ fluid.save_dygraph(self.actor.net.state_dict(), directory)
+ fluid.save_dygraph(self.optimizer.state_dict(), directory)
+ with open(os.path.join(directory, '_custom_state.pickle'), 'wb') as f:
+ pickle.dump(state, f)
+
+ def load_checkpoint(self, checkpoint=None):
+ """Loads a network checkpoint file.
+
+ Can be called in three different ways:
+ load_checkpoint():
+ Loads the latest epoch from the workspace. Use this to continue training.
+ load_checkpoint(epoch_num):
+ Loads the network at the given epoch number (int).
+ load_checkpoint(path_to_checkpoint):
+ Loads the file from the given absolute path (str).
+ """
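+
+        Example (illustrative; the path below is a placeholder):
+            trainer.load_checkpoint()                 # resume from the latest saved epoch
+            trainer.load_checkpoint(10)               # load the checkpoint of epoch 10
+            trainer.load_checkpoint('/path/to/ckpt')  # load from an explicit path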
+
+ net_type = type(self.actor.net).__name__
+
+ if checkpoint is None:
+ # Load most recent checkpoint
+ checkpoint_list = sorted(
+ glob.glob('{}/{}/{}_ep*'.format(self._checkpoint_dir,
+ self.settings.project_path,
+ net_type)))
+ if checkpoint_list:
+                checkpoint_path = os.path.splitext(checkpoint_list[-1])[0]
+ else:
+ print('No matching checkpoint file found')
+ return
+ elif isinstance(checkpoint, int):
+ # Checkpoint is the epoch number
+ checkpoint_path = '{}/{}/{}_ep{:04d}'.format(
+ self._checkpoint_dir, self.settings.project_path, net_type,
+ checkpoint)
+ elif isinstance(checkpoint, str):
+ # checkpoint is the path
+ checkpoint_path = os.path.expanduser(checkpoint)
+ else:
+ raise TypeError
+
+ # paddle load network
+ net_params, opt_params = fluid.load_dygraph(checkpoint_path)
+ self.actor.net.load_dict(net_params)
+ self.optimizer.set_dict(opt_params)
+
+        # load the custom training state that save_checkpoint pickled inside the
+        # checkpoint directory as '_custom_state.pickle'
+        state_path = os.path.join(checkpoint_path, '_custom_state.pickle')
+        with open(state_path, 'rb') as f:
+            current_state = pickle.load(f)
+
+        # restore the epoch counter and stats so that training resumes from the saved epoch
+        self.epoch = current_state.get('epoch', self.epoch)
+        self.stats = current_state.get('stats', self.stats)
+
+        print("\nCheckpoint loaded. Current state:")
+        for key, value in current_state.items():
+            print(key, value)
+
+ return True
diff --git a/PaddleCV/tracking/ltr/trainers/ltr_trainer.py b/PaddleCV/tracking/ltr/trainers/ltr_trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8b0380827493d0114e45c3f08e0f03bef827d01
--- /dev/null
+++ b/PaddleCV/tracking/ltr/trainers/ltr_trainer.py
@@ -0,0 +1,164 @@
+import os
+from collections import OrderedDict
+
+from ltr.trainers import BaseTrainer
+from ltr.admin.stats import AverageMeter, StatValue
+from ltr.admin.tensorboard import TensorboardWriter
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.dygraph as dygraph
+import time
+import numpy as np
+
+
+class LTRTrainer(BaseTrainer):
+ def __init__(self, actor, loaders, optimizer, settings, lr_scheduler=None):
+ """
+ args:
+ actor - The actor for training the network
+ loaders - list of dataset loaders, e.g. [train_loader, val_loader]. In each epoch, the trainer runs one
+ epoch for each loader.
+ optimizer - The optimizer used for training, e.g. Adam
+ settings - Training settings
+ lr_scheduler - Learning rate scheduler
+ """
+ super().__init__(actor, loaders, optimizer, settings, lr_scheduler)
+
+ self._set_default_settings()
+
+ # Initialize statistics variables
+ self.stats = OrderedDict({loader.name: None for loader in self.loaders})
+
+ # Initialize tensorboard
+ tensorboard_writer_dir = os.path.join(self.settings.env.tensorboard_dir,
+ self.settings.project_path)
+ self.tensorboard_writer = TensorboardWriter(tensorboard_writer_dir,
+ [l.name for l in loaders])
+
+ def _set_default_settings(self):
+ # Dict of all default values
+ default = {'print_interval': 10, 'print_stats': None, 'description': ''}
+
+ for param, default_value in default.items():
+ if getattr(self.settings, param, None) is None:
+ setattr(self.settings, param, default_value)
+
+ def cycle_dataset(self, loader):
+ """Do a cycle of training or validation."""
+ if loader.training:
+ self.actor.train()
+ else:
+ self.actor.eval()
+
+ self._init_timing()
+
+ for i, data in enumerate(loader, 1):
+ # get inputs
+ data = self.to_variable(data)
+ data['epoch'] = self.epoch
+ data['settings'] = self.settings
+
+ # forward pass
+ loss, stats = self.actor(data)
+
+ # backward pass and update weights
+ if loader.training:
+ loss.backward()
+ apply_collective_grads = getattr(self.actor.net,
+ "apply_collective_grads", None)
+ if callable(apply_collective_grads):
+ apply_collective_grads()
+ self.optimizer.minimize(loss)
+ self.actor.net.clear_gradients()
+
+ # update statistics
+ batch_size = data['train_images'].shape[loader.stack_dim]
+ self._update_stats(stats, batch_size, loader)
+
+ self._print_stats(i, loader, batch_size)
+
+            if i % len(loader) == 0:
+ self.save_checkpoint()
+ self._stats_new_epoch()
+ self._write_tensorboard()
+ return
+
+ def to_variable(self, data_dict):
+ keys = data_dict.keys()
+ for k in keys:
+ if k != "dataset":
+ data_dict[k] = dygraph.to_variable(
+ np.array(data_dict[k]).astype(np.float32))
+ return data_dict
+
+ def to_array(self, data_dict):
+ keys = data_dict.keys()
+ for k in keys:
+ if k != "dataset":
+ data_dict[k] = data_dict[k].numpy()
+ return data_dict
+
+ def train_epoch(self):
+ """Do one epoch for each loader."""
+ for loader in self.loaders:
+ if self.epoch % loader.epoch_interval == 0:
+ self.cycle_dataset(loader)
+
+ self._stats_new_epoch()
+ self._write_tensorboard()
+        print('Epoch {} train / eval done!'.format(self.epoch))
+
+ def _init_timing(self):
+ self.num_frames = 0
+ self.start_time = time.time()
+ self.prev_time = self.start_time
+
+ def _update_stats(self, new_stats: OrderedDict, batch_size, loader):
+ # Initialize stats if not initialized yet
+ if loader.name not in self.stats.keys() or self.stats[
+ loader.name] is None:
+ self.stats[loader.name] = OrderedDict(
+ {name: AverageMeter()
+ for name in new_stats.keys()})
+
+ for name, val in new_stats.items():
+ if name not in self.stats[loader.name].keys():
+ self.stats[loader.name][name] = AverageMeter()
+ self.stats[loader.name][name].update(val, batch_size)
+
+ def _print_stats(self, i, loader, batch_size):
+ self.num_frames += batch_size
+ current_time = time.time()
+ batch_fps = batch_size / (current_time - self.prev_time)
+ average_fps = self.num_frames / (current_time - self.start_time)
+ self.prev_time = current_time
+        if i % self.settings.print_interval == 0 or i == len(loader):
+            print_str = '[%s: %d, %d / %d] ' % (loader.name, self.epoch, i,
+                                                len(loader))
+ print_str += 'FPS: %.1f (%.1f) , ' % (average_fps, batch_fps)
+ for name, val in self.stats[loader.name].items():
+ if (self.settings.print_stats is None or
+ name in self.settings.print_stats) and hasattr(val,
+ 'avg'):
+ print_str += '%s: %.5f , ' % (name, val.avg)
+ print_str += '%s: %.5f , ' % ("time", batch_size / batch_fps *
+ self.settings.print_interval)
+ print(print_str[:-5])
+
+ def _stats_new_epoch(self):
+ for loader_stats in self.stats.values():
+ if loader_stats is None:
+ continue
+ for stat_value in loader_stats.values():
+ if hasattr(stat_value, 'new_epoch'):
+ stat_value.new_epoch()
+
+ def _write_tensorboard(self):
+ if self.epoch == 1:
+ self.tensorboard_writer.write_info(self.settings.module_name,
+ self.settings.script_name,
+ self.settings.description)
+
+ self.tensorboard_writer.write_epoch(self.stats, self.epoch)
+ print('{}/{}'.format(self.settings.module_name,
+ self.settings.script_name))
diff --git a/PaddleCV/tracking/pytracking/__init__.py b/PaddleCV/tracking/pytracking/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/PaddleCV/tracking/pytracking/admin/environment.py b/PaddleCV/tracking/pytracking/admin/environment.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32c103bc04a11774147f9abb3a953a123136fc8
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/admin/environment.py
@@ -0,0 +1,52 @@
+import importlib
+import os
+
+
+class EnvSettings:
+ def __init__(self):
+ pytracking_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
+
+ self.results_path = '{}/tracking_results/'.format(pytracking_path)
+ self.network_path = '{}/networks/'.format(pytracking_path)
+ self.dataset_path = '{}/benchmark_datasets/'.format(pytracking_path)
+
+
+def create_default_local_file():
+ comment = {'results_path': 'Where to store tracking results',
+ 'dataset_path': 'Where benchmark datasets are stored',
+ 'network_path': 'Where tracking networks are stored.'}
+
+ path = os.path.join(os.path.dirname(__file__), 'local.py')
+ with open(path, 'w') as f:
+ settings = EnvSettings()
+
+ f.write('from pytracking.admin.environment import EnvSettings\n\n')
+ f.write('def local_env_settings():\n')
+ f.write(' settings = EnvSettings()\n\n')
+ f.write(' # Set your local paths here.\n\n')
+
+ for attr in dir(settings):
+ comment_str = None
+ if attr in comment:
+ comment_str = comment[attr]
+ attr_val = getattr(settings, attr)
+ if not attr.startswith('__') and not callable(attr_val):
+ if comment_str is None:
+ f.write(' settings.{} = \'{}\'\n'.format(attr, attr_val))
+ else:
+ f.write(' settings.{} = \'{}\' # {}\n'.format(attr, attr_val, comment_str))
+ f.write('\n return settings\n\n')
+
+
+def env_settings():
+ env_module_name = 'pytracking.admin.local'
+ try:
+ env_module = importlib.import_module(env_module_name)
+ return env_module.local_env_settings()
+ except:
+ env_file = os.path.join(os.path.dirname(__file__), 'local.py')
+
+ # Create a default file
+ create_default_local_file()
+ raise RuntimeError('YOU HAVE NOT SETUP YOUR local.py!!!\n Go to "{}" and set all the paths you need. '
+ 'Then try to run again.'.format(env_file))
diff --git a/PaddleCV/tracking/pytracking/admin/local.py b/PaddleCV/tracking/pytracking/admin/local.py
new file mode 100644
index 0000000000000000000000000000000000000000..40e8e23f3203e49a15fab9c890eef90300a8d445
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/admin/local.py
@@ -0,0 +1,13 @@
+from pytracking.admin.environment import EnvSettings
+
+
+def local_env_settings():
+ settings = EnvSettings()
+
+ # Set your local paths here.
+
+ settings.dataset_path = '' # Where benchmark datasets are stored
+ settings.network_path = '' # Where tracking networks are stored.
+ settings.results_path = '/models/PaddleCV/tracking/pytracking/tracking_results/' # Where to store tracking results
+
+ return settings
diff --git a/PaddleCV/tracking/pytracking/eval_benchmark.py b/PaddleCV/tracking/pytracking/eval_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf9536eb2944916d33e42cad35770b0043b38042
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/eval_benchmark.py
@@ -0,0 +1,308 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+import importlib
+import os
+import os.path as osp
+import pickle
+import sys
+from glob import glob
+
+import cv2 as cv
+import numpy as np
+from tqdm import tqdm
+
+CURRENT_DIR = osp.dirname(__file__)
+sys.path.append(osp.join(CURRENT_DIR, '..'))
+
+from pytracking.admin.environment import env_settings
+from pytracking.pysot_toolkit.pysot.datasets import DatasetFactory
+from pytracking.pysot_toolkit.pysot.evaluation import EAOBenchmark, AccuracyRobustnessBenchmark, OPEBenchmark
+from pytracking.pysot_toolkit.pysot.utils.region import vot_overlap
+
+parser = argparse.ArgumentParser(description='tracking evaluation')
+
+parser.add_argument('--dataset', '-d', type=str, help='dataset name')
+parser.add_argument(
+ '--training_base_param', '-tr', type=str, help='training base params name')
+parser.add_argument('--epoch', '-e', type=str, help='epoch specifications')
+parser.add_argument(
+ '--tracking_base_param', '-te', type=str, help='tracking base params name')
+parser.add_argument(
+ '--num_repeat', '-n', default=1, type=int, help='number of repeat')
+parser.add_argument(
+ '--exp_id', '-ex', default='', type=str, help='experiment id')
+
+args = parser.parse_args()
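+
+# Example invocation (illustrative; the train_settings / parameter module names are
+# placeholders that depend on your local configuration):
+#   python eval_benchmark.py -d OTB100 -tr bbreg.atom_res18_vid_lasot_coco -e 40 -te atom.default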
+
+
+def read_image(x):
+ if isinstance(x, str):
+ img = cv.imread(x)
+ else:
+ img = x
+ return cv.cvtColor(img, cv.COLOR_BGR2RGB)
+
+
+def get_tracker_params(param_module, params):
+ tracker_params = param_module.parameters()
+ tracker_params.debug = 0 # disable debug
+ # change checkpoint path
+ tracker_params.features.features[0].net_path = params['checkpoint']
+ return tracker_params
+
+
+def create_tracker(params):
+ base_param = params['tracking_base_param']
+ base_tracker = base_param.split('.')[0]
+ param_module = importlib.import_module('pytracking.parameter.{}'.format(
+ base_param))
+ tracker_params = get_tracker_params(param_module, params)
+ tracker_module = importlib.import_module('pytracking.tracker.{}'.format(
+ base_tracker))
+ tracker_class = tracker_module.get_tracker_class()
+ return tracker_class(tracker_params)
+
+
+def get_axis_aligned_bbox(region):
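+    """Convert a VOT-style polygon region (8 values, or a (1, 4, 2) array) into an
+    axis-aligned (x, y, w, h) box whose area roughly matches the polygon area."""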
+ region = np.array(region)
+ if len(region.shape) == 3:
+ # region (1,4,2)
+ region = np.array([
+ region[0][0][0], region[0][0][1], region[0][1][0], region[0][1][1],
+ region[0][2][0], region[0][2][1], region[0][3][0], region[0][3][1]
+ ])
+
+ cx = np.mean(region[0::2])
+ cy = np.mean(region[1::2])
+ x1 = min(region[0::2])
+
+ x2 = max(region[0::2])
+ y1 = min(region[1::2])
+ y2 = max(region[1::2])
+
+ A1 = np.linalg.norm(region[0:2] - region[2:4]) * np.linalg.norm(region[
+ 2:4] - region[4:6])
+ A2 = (x2 - x1) * (y2 - y1)
+ s = np.sqrt(A1 / A2)
+ w = s * (x2 - x1) + 1
+ h = s * (y2 - y1) + 1
+
+ x11 = cx - w // 2
+ y11 = cy - h // 2
+
+ return x11, y11, w, h
+
+
+def run_tracker(tracker, video, reset=False):
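+    """Run the tracker over a single video.
+
+    With reset=True the VOT reset protocol is used: the prediction list stores 1 for
+    the init frame, 2 when the overlap with the ground truth drops to zero (failure),
+    and 0 for the skipped frames before tracking restarts five frames later.
+    With reset=False the tracker is initialized once and run through the whole video.
+    """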
+ if reset:
+ frame_counter = 0
+ pred_bboxes = []
+ for idx, (img_p, gt_bbox) in enumerate(video):
+ if idx == frame_counter:
+ # init your tracker here
+ image = read_image(img_p)
+ if len(gt_bbox) == 8:
+ init_bbox = get_axis_aligned_bbox(gt_bbox)
+ else:
+ init_bbox = gt_bbox
+ tracker.initialize(image, init_bbox)
+ pred_bboxes.append(1)
+ elif idx > frame_counter:
+ # get tracking result here
+ image = read_image(img_p)
+ pred_bbox = tracker.track(image)
+ overlap = vot_overlap(pred_bbox, gt_bbox,
+ (image.shape[1], image.shape[0]))
+ if overlap > 0:
+ # continue tracking
+ pred_bboxes.append(pred_bbox)
+ else:
+ # lost target, restart
+ pred_bboxes.append(2)
+ frame_counter = idx + 5
+ else:
+ pred_bboxes.append(0)
+ else:
+ pred_bboxes = []
+ for idx, (img_p, gt_bbox) in enumerate(video):
+ if idx == 0:
+ # init your tracker here
+ image = read_image(img_p)
+ if len(gt_bbox) == 8:
+ init_bbox = get_axis_aligned_bbox(gt_bbox)
+ else:
+ init_bbox = gt_bbox
+ tracker.initialize(image, init_bbox)
+ pred_bboxes.append(init_bbox)
+ else:
+ # get tracking result here
+ image = read_image(img_p)
+ pred_bbox = tracker.track(image)
+ pred_bboxes.append(pred_bbox)
+ return pred_bboxes
+
+
+def run_one_sequence(video, params, tracker=None):
+ # idt = multiprocessing.current_process()._identity[0]
+ # os.environ["CUDA_VISIBLE_DEVICES"] = "{}".format(idt % 4)
+ save_dir = osp.join(params['result_dir'], params['save_dataset_name'],
+ params['tracking_base_param'], params['exp_id'])
+
+ if tracker is None:
+ tracker = create_tracker(params)
+
+ if 'VOT' in params['dataset_name']:
+ save_sub_dir = osp.join(save_dir, 'baseline', video.name)
+ os.makedirs(save_sub_dir, exist_ok=True)
+ num_repeat = params.get('num_repeat', 1)
+ for repeat_idx in range(1, num_repeat + 1):
+ save_path = osp.join(save_sub_dir,
+ video.name + '_{:03d}.txt'.format(repeat_idx))
+ if osp.exists(save_path): continue
+ pred_bboxes = run_tracker(tracker, video, reset=True)
+
+ # Save tracking results
+ with open(save_path, 'w') as f:
+ outputs = []
+ for res in pred_bboxes:
+ if isinstance(res, int):
+ outputs.append('{}'.format(res))
+ else:
+                        outputs.append('{},{},{},{}'.format(
+                            res[0], res[1], res[2], res[3]))
+ f.write('\n'.join(outputs))
+ else:
+ os.makedirs(save_dir, exist_ok=True)
+ save_path = osp.join(save_dir, video.name + '.txt')
+ if osp.exists(save_path): return
+ pred_bboxes = run_tracker(tracker, video, reset=False)
+
+ # Save tracking results
+ with open(save_path, 'w') as f:
+ outputs = []
+ for res in pred_bboxes:
+                outputs.append('{},{},{},{}'.format(
+                    res[0], res[1], res[2], res[3]))
+ f.write('\n'.join(outputs))
+
+
+def run_one_dataset(dataset, params):
+ # use the same tracker for all sequences
+ tracker = create_tracker(params)
+ # create new tracker for each sequence
+ # tracker = None
+ for video in tqdm(list(dataset.videos.values())):
+ run_one_sequence(video, params, tracker=tracker)
+
+
+def compute_evaluation_metrics(dataset, params):
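+    """Evaluate stored tracking results: VOT datasets are scored with Accuracy/
+    Robustness and EAO, all other datasets with OPE success and precision."""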
+ result_dir = osp.join(params['result_dir'], params['save_dataset_name'],
+ params['tracking_base_param'])
+ tracker_name = params['exp_id']
+ trackers = [tracker_name]
+ dataset.set_tracker(result_dir, trackers)
+
+ if 'VOT' in params['dataset_name']:
+ ar_benchmark = AccuracyRobustnessBenchmark(dataset)
+ ar_result = {}
+ ar_result.update(ar_benchmark.eval(trackers))
+
+ eao_benchmark = EAOBenchmark(dataset)
+ eao_result = {}
+ eao_result.update(eao_benchmark.eval(trackers))
+
+ ar_benchmark.show_result(ar_result, eao_result)
+ metrics = {'ar': ar_result, 'eao': eao_result}
+ else:
+ benchmark = OPEBenchmark(dataset)
+ success_result = {}
+ precision_result = {}
+ success_result.update(benchmark.eval_success(trackers))
+ precision_result.update(benchmark.eval_precision(trackers))
+ benchmark.show_result(success_result, precision_result)
+ metrics = {'success': success_result, 'precision': precision_result}
+ return metrics
+
+
+def save_info(params, metrics):
+ save_dir = osp.join(params['result_dir'], params['save_dataset_name'],
+ params['tracking_base_param'], params['exp_id'])
+ with open(osp.join(save_dir, 'params.pickle'), 'wb') as f:
+ pickle.dump(params, f)
+
+ with open(osp.join(save_dir, 'metrics.txt'), 'w') as f:
+ f.write('{}'.format(metrics))
+
+
+def run_tracking_and_evaluate(params):
+ """Receive hyperparameters and return the evaluation metric"""
+ # load dataset
+ root = os.path.abspath(
+ osp.join(env_settings().dataset_path, params['save_dataset_name']))
+ dataset = DatasetFactory.create_dataset(
+ name=params['dataset_name'], dataset_root=root)
+
+ run_one_dataset(dataset, params)
+ metrics = compute_evaluation_metrics(dataset, params)
+
+ return metrics
+
+
+def get_checkpoint_path(training_base_param, epoch):
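+    """Return the checkpoint path prefix (without extension) for the requested epoch,
+    derived from the .pdparams files found under the training run's model directory."""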
+ model_dir = osp.abspath(
+ osp.join(env_settings().network_path, *training_base_param.split('.')))
+ model_names = glob(model_dir + '/*.pdparams')
+ prefix = '_'.join(model_names[0].split('_')[:-1])
+ return osp.join(model_dir, '{}_ep{:04d}'.format(prefix, epoch))
+
+
+def parse_epoch(epoch_str):
+ epochs = eval(epoch_str)
+ try:
+ iterator = iter(epochs)
+    except TypeError:
+ if isinstance(epochs, int):
+ iterator = [epochs]
+ else:
+ raise NotImplementedError
+ return iterator
+
+
+def main():
+ for epoch in parse_epoch(args.epoch):
+ # get checkpoint
+ checkpoint_pth = get_checkpoint_path(args.training_base_param, epoch)
+
+ if args.exp_id == '':
+ exp_id = args.training_base_param + '.epoch{}'.format(epoch)
+ else:
+ exp_id = args.exp_id
+ print('=> Evaluating: {}'.format(exp_id))
+
+ if args.dataset in ['CVPR13', 'OTB50', 'OTB100']:
+ # for OTB datasets, we save results into the same directory
+ save_dataset_name = 'OTB100'
+ else:
+ save_dataset_name = args.dataset
+
+ # set up parameters
+ params = {
+ 'dataset_name': args.dataset,
+ 'checkpoint': checkpoint_pth,
+ 'tracking_base_param': args.tracking_base_param,
+ 'num_repeat': args.num_repeat,
+ 'exp_id': exp_id,
+ 'result_dir': env_settings().results_path,
+ 'save_dataset_name': save_dataset_name,
+ }
+
+ metrics = run_tracking_and_evaluate(params)
+ save_info(params, metrics)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/PaddleCV/tracking/pytracking/features/__init__.py b/PaddleCV/tracking/pytracking/features/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/PaddleCV/tracking/pytracking/features/augmentation.py b/PaddleCV/tracking/pytracking/features/augmentation.py
new file mode 100644
index 0000000000000000000000000000000000000000..8171ec59784a4f7f791cf726dd277df88c3ac80b
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/features/augmentation.py
@@ -0,0 +1,205 @@
+import numpy as np
+import math
+
+from paddle.fluid import layers
+
+import cv2 as cv
+
+from pytracking.features.preprocessing import numpy_to_paddle, paddle_to_numpy
+from pytracking.libs.Fconv2d import Fconv2d
+from pytracking.libs.paddle_utils import PTensor, _padding, n2p
+
+
+class Transform:
+ """Base data augmentation transform class."""
+
+ def __init__(self, output_sz=None, shift=None):
+ self.output_sz = output_sz
+ self.shift = (0, 0) if shift is None else shift
+
+ def __call__(self, image):
+ raise NotImplementedError
+
+ def crop_to_output(self, image, shift=None):
+ if isinstance(image, PTensor):
+ imsz = image.shape[2:]
+ else:
+ imsz = image.shape[:2]
+
+ if self.output_sz is None:
+ pad_h = 0
+ pad_w = 0
+ else:
+ pad_h = (self.output_sz[0] - imsz[0]) / 2
+ pad_w = (self.output_sz[1] - imsz[1]) / 2
+ if shift is None:
+ shift = self.shift
+ pad_left = math.floor(pad_w) + shift[1]
+ pad_right = math.ceil(pad_w) - shift[1]
+ pad_top = math.floor(pad_h) + shift[0]
+ pad_bottom = math.ceil(pad_h) - shift[0]
+
+ if isinstance(image, PTensor):
+ return _padding(
+ image, (pad_left, pad_right, pad_top, pad_bottom),
+ mode='replicate')
+ else:
+ return _padding(
+ image, (0, 0, pad_left, pad_right, pad_top, pad_bottom),
+ mode='replicate')
+
+
+class Identity(Transform):
+ """Identity transformation."""
+
+ def __call__(self, image):
+ return self.crop_to_output(image)
+
+
+class FlipHorizontal(Transform):
+ """Flip along horizontal axis."""
+
+ def __call__(self, image):
+ if isinstance(image, PTensor):
+ return self.crop_to_output(layers.reverse(image, 3))
+ else:
+ return self.crop_to_output(np.fliplr(image))
+
+
+class FlipVertical(Transform):
+ """Flip along vertical axis."""
+
+ def __call__(self, image: PTensor):
+ if isinstance(image, PTensor):
+ return self.crop_to_output(layers.reverse(image, 2))
+ else:
+ return self.crop_to_output(np.flipud(image))
+
+
+class Translation(Transform):
+ """Translate."""
+
+ def __init__(self, translation, output_sz=None, shift=None):
+ super().__init__(output_sz, shift)
+ self.shift = (self.shift[0] + translation[0],
+ self.shift[1] + translation[1])
+
+ def __call__(self, image):
+ return self.crop_to_output(image)
+
+
+class Scale(Transform):
+ """Scale."""
+
+ def __init__(self, scale_factor, output_sz=None, shift=None):
+ super().__init__(output_sz, shift)
+ self.scale_factor = scale_factor
+
+ def __call__(self, image):
+ # Calculate new size. Ensure that it is even so that crop/pad becomes easier
+ h_orig, w_orig = image.shape[2:]
+
+ if h_orig != w_orig:
+ raise NotImplementedError
+
+ h_new = round(h_orig / self.scale_factor)
+ h_new += (h_new - h_orig) % 2
+ w_new = round(w_orig / self.scale_factor)
+ w_new += (w_new - w_orig) % 2
+
+ if isinstance(image, PTensor):
+ image_resized = layers.resize_bilinear(
+ image, [h_new, w_new], align_corners=False)
+ else:
+ image_resized = cv.resize(
+ image, (w_new, h_new), interpolation=cv.INTER_LINEAR)
+ return self.crop_to_output(image_resized)
+
+
+class Affine(Transform):
+ """Affine transformation."""
+
+ def __init__(self, transform_matrix, output_sz=None, shift=None):
+ super().__init__(output_sz, shift)
+ self.transform_matrix = transform_matrix
+
+ def __call__(self, image, crop=True):
+ if isinstance(image, PTensor):
+ return self.crop_to_output(
+ numpy_to_paddle(self(
+ paddle_to_numpy(image), crop=False)))
+ else:
+ warp = cv.warpAffine(
+ image,
+ self.transform_matrix,
+ image.shape[1::-1],
+ borderMode=cv.BORDER_REPLICATE)
+ if crop:
+ return self.crop_to_output(warp)
+ else:
+ return warp
+
+
+class Rotate(Transform):
+ """Rotate with given angle."""
+
+ def __init__(self, angle, output_sz=None, shift=None):
+ super().__init__(output_sz, shift)
+ self.angle = math.pi * angle / 180
+
+ def __call__(self, image, crop=True):
+ if isinstance(image, PTensor):
+ return self.crop_to_output(
+ numpy_to_paddle(self(
+ paddle_to_numpy(image), crop=False)))
+ else:
+ c = (np.expand_dims(np.array(image.shape[:2]), 1) - 1) / 2
+ R = np.array([[math.cos(self.angle), math.sin(self.angle)],
+ [-math.sin(self.angle), math.cos(self.angle)]])
+            H = np.concatenate([R, c - R @ c], 1)
+ warp = cv.warpAffine(
+ image, H, image.shape[1::-1], borderMode=cv.BORDER_REPLICATE)
+ if crop:
+ return self.crop_to_output(warp)
+ else:
+ return warp
+
+
+class Blur(Transform):
+ """Blur with given sigma (can be axis dependent)."""
+
+ def __init__(self, sigma, output_sz=None, shift=None):
+ super().__init__(output_sz, shift)
+ if isinstance(sigma, (float, int)):
+ sigma = (sigma, sigma)
+ self.sigma = sigma
+ self.filter_size = [math.ceil(2 * s) for s in self.sigma]
+
+ x_coord = [
+ np.arange(
+ -sz, sz + 1, 1, dtype='float32') for sz in self.filter_size
+ ]
+ self.filter_np = [
+ np.exp(0 - (x * x) / (2 * s**2))
+ for x, s in zip(x_coord, self.sigma)
+ ]
+ self.filter_np[0] = np.reshape(
+ self.filter_np[0], [1, 1, -1, 1]) / np.sum(self.filter_np[0])
+ self.filter_np[1] = np.reshape(
+ self.filter_np[1], [1, 1, 1, -1]) / np.sum(self.filter_np[1])
+
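+    # The 2-D Gaussian blur is realized as two separable 1-D convolutions: a vertical
+    # pass with filter_np[0] followed by a horizontal pass with filter_np[1].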
+ def __call__(self, image):
+ if isinstance(image, PTensor):
+ sz = image.shape[2:]
+ filter = [n2p(f) for f in self.filter_np]
+ im1 = Fconv2d(
+ layers.reshape(image, [-1, 1, sz[0], sz[1]]),
+ filter[0],
+ padding=(self.filter_size[0], 0))
+ return self.crop_to_output(
+ layers.reshape(
+ Fconv2d(
+ im1, filter[1], padding=(0, self.filter_size[1])),
+ [1, -1, sz[0], sz[1]]))
+ else:
+ return paddle_to_numpy(self(numpy_to_paddle(image)))
diff --git a/PaddleCV/tracking/pytracking/features/color.py b/PaddleCV/tracking/pytracking/features/color.py
new file mode 100644
index 0000000000000000000000000000000000000000..969621011d1f21f52ce8b88b3567ddd32d2d9f2e
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/features/color.py
@@ -0,0 +1,30 @@
+from paddle.fluid import layers
+from pytracking.features.featurebase import FeatureBase
+from pytracking.libs.paddle_utils import PTensor
+import numpy as np
+
+
+class RGB(FeatureBase):
+ """RGB feature normalized to [-0.5, 0.5]."""
+
+ def dim(self):
+ return 3
+
+ def stride(self):
+ return self.pool_stride
+
+ def extract(self, im: np.ndarray):
+ return im / 255 - 0.5
+
+
+class Grayscale(FeatureBase):
+ """Grayscale feature normalized to [-0.5, 0.5]."""
+
+ def dim(self):
+ return 1
+
+ def stride(self):
+ return self.pool_stride
+
+ def extract(self, im: np.ndarray):
+ return np.mean(im / 255 - 0.5, 1, keepdims=True)
diff --git a/PaddleCV/tracking/pytracking/features/deep.py b/PaddleCV/tracking/pytracking/features/deep.py
new file mode 100644
index 0000000000000000000000000000000000000000..376acf6d07b7390a94fb3d9ae830f245d0b0ee27
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/features/deep.py
@@ -0,0 +1,349 @@
+import os
+
+import numpy as np
+from paddle import fluid
+
+from ltr.models.bbreg.atom import atom_resnet50, atom_resnet18
+from ltr.models.siamese.siam import siamfc_alexnet
+from pytracking.admin.environment import env_settings
+from pytracking.features.featurebase import MultiFeatureBase
+from pytracking.libs import TensorList
+from pytracking.libs.paddle_utils import n2p
+
+
+class ResNet18(MultiFeatureBase):
+ """ResNet18 feature.
+ args:
+ output_layers: List of layers to output.
+ net_path: Relative or absolute net path (default should be fine).
+ use_gpu: Use GPU or CPU.
+ """
+
+ def __init__(self,
+ output_layers=('block2', ),
+ net_path='atom_iou',
+ use_gpu=True,
+ *args,
+ **kwargs):
+ super().__init__(*args, **kwargs)
+
+ self.output_layers = list(output_layers)
+ self.use_gpu = use_gpu
+ self.net_path = net_path
+
+ def initialize(self):
+ with fluid.dygraph.guard():
+ if os.path.isabs(self.net_path):
+ net_path_full = self.net_path
+ else:
+ net_path_full = os.path.join(env_settings().network_path,
+ self.net_path)
+
+ self.net = atom_resnet18(
+ backbone_pretrained=False,
+ backbone_is_test=True,
+ iounet_is_test=True)
+
+ state_dictsm, _ = fluid.load_dygraph(net_path_full)
+ self.net.load_dict(state_dictsm)
+ self.net.train()
+
+ self.iou_predictor = self.net.bb_regressor
+
+ self.layer_stride = {
+ 'conv0': 2,
+ 'conv1': 2,
+ 'block0': 4,
+ 'block1': 8,
+ 'block2': 16,
+ 'block3': 32,
+ 'classification': 16,
+ 'fc': None
+ }
+ self.layer_dim = {
+ 'conv0': 64,
+ 'conv1': 64,
+ 'block0': 64,
+ 'block1': 128,
+ 'block2': 256,
+ 'block3': 512,
+ 'classification': 256,
+ 'fc': None
+ }
+
+ self.iounet_feature_layers = self.net.bb_regressor_layer
+
+ if isinstance(self.pool_stride, int) and self.pool_stride == 1:
+ self.pool_stride = [1] * len(self.output_layers)
+
+ self.feature_layers = sorted(
+ list(set(self.output_layers + self.iounet_feature_layers)))
+
+ self.mean = np.reshape([0.485, 0.456, 0.406], [1, -1, 1, 1])
+ self.std = np.reshape([0.229, 0.224, 0.225], [1, -1, 1, 1])
+
+ def free_memory(self):
+ if hasattr(self, 'net'):
+ del self.net
+ if hasattr(self, 'iou_predictor'):
+ del self.iou_predictor
+ if hasattr(self, 'iounet_backbone_features'):
+ del self.iounet_backbone_features
+ if hasattr(self, 'iounet_features'):
+ del self.iounet_features
+
+ def dim(self):
+ return TensorList([self.layer_dim[l] for l in self.output_layers])
+
+ def stride(self):
+ return TensorList([
+ s * self.layer_stride[l]
+ for l, s in zip(self.output_layers, self.pool_stride)
+ ])
+
+ def extract(self, im: np.ndarray, debug_save_name=None):
+ with fluid.dygraph.guard():
+ if debug_save_name is not None:
+ np.savez(debug_save_name, im)
+
+ im = im / 255. # don't use im /= 255. since we don't want to alter the input
+ im -= self.mean
+ im /= self.std
+ im = n2p(im)
+
+ output_features = self.net.extract_features(im, self.feature_layers)
+
+ # Store the raw resnet features which are input to iounet
+ iounet_backbone_features = TensorList([
+ output_features[layer] for layer in self.iounet_feature_layers
+ ])
+ self.iounet_backbone_features = iounet_backbone_features.numpy()
+
+ # Store the processed features from iounet, just before pooling
+ self.iounet_features = TensorList([
+ f.numpy()
+ for f in self.iou_predictor.get_iou_feat(
+ iounet_backbone_features)
+ ])
+
+ output = TensorList([
+ output_features[layer].numpy() for layer in self.output_layers
+ ])
+ return output
+
+
+class ResNet50(MultiFeatureBase):
+ """ResNet50 feature.
+ args:
+ output_layers: List of layers to output.
+ net_path: Relative or absolute net path (default should be fine).
+ use_gpu: Use GPU or CPU.
+ """
+
+ def __init__(self,
+ output_layers=('block2', ),
+ net_path='atom_iou',
+ use_gpu=True,
+ *args,
+ **kwargs):
+ super().__init__(*args, **kwargs)
+
+ self.output_layers = list(output_layers)
+ self.use_gpu = use_gpu
+ self.net_path = net_path
+
+ def initialize(self):
+ with fluid.dygraph.guard():
+ if os.path.isabs(self.net_path):
+ net_path_full = self.net_path
+ else:
+ net_path_full = os.path.join(env_settings().network_path,
+ self.net_path)
+
+ self.net = atom_resnet50(
+ backbone_pretrained=False,
+ backbone_is_test=True,
+ iounet_is_test=True)
+
+ state_dictsm, _ = fluid.load_dygraph(net_path_full)
+ self.net.load_dict(state_dictsm)
+ self.net.train()
+
+ self.iou_predictor = self.net.bb_regressor
+
+ self.layer_stride = {
+ 'conv0': 2,
+ 'conv1': 2,
+ 'block0': 4,
+ 'block1': 8,
+ 'block2': 16,
+ 'block3': 32,
+ 'classification': 16,
+ 'fc': None
+ }
+ self.layer_dim = {
+ 'conv0': 64,
+ 'conv1': 64,
+ 'block0': 256,
+ 'block1': 512,
+ 'block2': 1024,
+ 'block3': 2048,
+ 'classification': 256,
+ 'fc': None
+ }
+
+ self.iounet_feature_layers = self.net.bb_regressor_layer
+
+ if isinstance(self.pool_stride, int) and self.pool_stride == 1:
+ self.pool_stride = [1] * len(self.output_layers)
+
+ self.feature_layers = sorted(
+ list(set(self.output_layers + self.iounet_feature_layers)))
+
+ self.mean = np.reshape([0.485, 0.456, 0.406], [1, -1, 1, 1])
+ self.std = np.reshape([0.229, 0.224, 0.225], [1, -1, 1, 1])
+
+ def free_memory(self):
+ if hasattr(self, 'net'):
+ del self.net
+ if hasattr(self, 'iou_predictor'):
+ del self.iou_predictor
+ if hasattr(self, 'iounet_backbone_features'):
+ del self.iounet_backbone_features
+ if hasattr(self, 'iounet_features'):
+ del self.iounet_features
+
+ def dim(self):
+ return TensorList([self.layer_dim[l] for l in self.output_layers])
+
+ def stride(self):
+ return TensorList([
+ s * self.layer_stride[l]
+ for l, s in zip(self.output_layers, self.pool_stride)
+ ])
+
+ def extract(self, im: np.ndarray, debug_save_name=None):
+ with fluid.dygraph.guard():
+ if debug_save_name is not None:
+ np.savez(debug_save_name, im)
+
+ im = im / 255. # don't use im /= 255. since we don't want to alter the input
+ im -= self.mean
+ im /= self.std
+ im = n2p(im)
+
+ output_features = self.net.extract_features(im, self.feature_layers)
+
+ # Store the raw resnet features which are input to iounet
+ iounet_backbone_features = TensorList([
+ output_features[layer] for layer in self.iounet_feature_layers
+ ])
+ self.iounet_backbone_features = iounet_backbone_features.numpy()
+
+ # Store the processed features from iounet, just before pooling
+ self.iounet_features = TensorList([
+ f.numpy()
+ for f in self.iou_predictor.get_iou_feat(
+ iounet_backbone_features)
+ ])
+
+ output = TensorList([
+ output_features[layer].numpy() for layer in self.output_layers
+ ])
+ return output
+
+
+class SFCAlexnet(MultiFeatureBase):
+ """Alexnet feature.
+ args:
+ output_layers: List of layers to output.
+ net_path: Relative or absolute net path (default should be fine).
+ use_gpu: Use GPU or CPU.
+ """
+
+ def __init__(self,
+ output_layers=('conv5', ),
+ net_path='estimator',
+ use_gpu=True,
+ *args,
+ **kwargs):
+ super().__init__(*args, **kwargs)
+
+ self.output_layers = list(output_layers)
+ self.use_gpu = use_gpu
+ self.net_path = net_path
+
+ def initialize(self):
+ with fluid.dygraph.guard():
+ if os.path.isabs(self.net_path):
+ net_path_full = self.net_path
+ else:
+ net_path_full = os.path.join(env_settings().network_path,
+ self.net_path)
+
+ self.net = siamfc_alexnet(
+ backbone_pretrained=False,
+ backbone_is_test=True,
+ estimator_is_test=True)
+
+ state_dictsm, _ = fluid.load_dygraph(net_path_full)
+ self.net.load_dict(state_dictsm)
+ self.net.train()
+
+ self.target_estimator = self.net.target_estimator
+
+ self.layer_stride = {'conv5': 8}
+ self.layer_dim = {'conv5': 256}
+
+ self.estimator_feature_layers = self.net.target_estimator_layer
+
+ if isinstance(self.pool_stride, int) and self.pool_stride == 1:
+ self.pool_stride = [1] * len(self.output_layers)
+
+ self.feature_layers = sorted(
+ list(set(self.output_layers + self.estimator_feature_layers)))
+
+ self.mean = np.reshape([0., 0., 0.], [1, -1, 1, 1])
+ self.std = np.reshape([1 / 255., 1 / 255., 1 / 255.], [1, -1, 1, 1])
+
+ def free_memory(self):
+ if hasattr(self, 'net'):
+ del self.net
+ if hasattr(self, 'target_estimator'):
+ del self.target_estimator
+ if hasattr(self, 'estimator_backbone_features'):
+ del self.estimator_backbone_features
+
+ def dim(self):
+ return TensorList([self.layer_dim[l] for l in self.output_layers])
+
+ def stride(self):
+ return TensorList([
+ s * self.layer_stride[l]
+ for l, s in zip(self.output_layers, self.pool_stride)
+ ])
+
+ def extract(self, im: np.ndarray, debug_save_name=None):
+ with fluid.dygraph.guard():
+ if debug_save_name is not None:
+ np.savez(debug_save_name, im)
+
+ im = im / 255. # don't use im /= 255. since we don't want to alter the input
+ im -= self.mean
+ im /= self.std
+ im = n2p(im)
+
+ output_features = self.net.extract_features(im, self.feature_layers)
+
+ # Store the raw backbone features which are input to estimator
+ estimator_backbone_features = TensorList([
+ output_features[layer]
+ for layer in self.estimator_feature_layers
+ ])
+ self.estimator_backbone_features = estimator_backbone_features.numpy()
+
+ output = TensorList([
+ output_features[layer].numpy() for layer in self.output_layers
+ ])
+ return output
diff --git a/PaddleCV/tracking/pytracking/features/extractor.py b/PaddleCV/tracking/pytracking/features/extractor.py
new file mode 100644
index 0000000000000000000000000000000000000000..025c915491a6f38dde81622a64accb5e28c27dfc
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/features/extractor.py
@@ -0,0 +1,185 @@
+import numpy as np
+from paddle import fluid
+from paddle.fluid import layers
+from pytracking.features.preprocessing import sample_patch
+from pytracking.libs import TensorList
+
+
+class ExtractorBase:
+ """Base feature extractor class.
+ args:
+ features: List of features.
+ """
+
+ def __init__(self, features):
+ self.features = features
+
+ def initialize(self):
+ for f in self.features:
+ f.initialize()
+
+ def free_memory(self):
+ for f in self.features:
+ f.free_memory()
+
+
+class SingleResolutionExtractor(ExtractorBase):
+ """Single resolution feature extractor.
+ args:
+ features: List of features.
+ """
+
+ def __init__(self, features):
+ super().__init__(features)
+
+ self.feature_stride = self.features[0].stride()
+ if isinstance(self.feature_stride, (list, TensorList)):
+ self.feature_stride = self.feature_stride[0]
+
+ def stride(self):
+ return self.feature_stride
+
+ def size(self, input_sz):
+ return input_sz // self.stride()
+
+ def extract(self, im, pos, scales, image_sz):
+ if isinstance(scales, (int, float)):
+ scales = [scales]
+
+ # Get image patches
+ im_patches = np.stack(
+ [sample_patch(im, pos, s * image_sz, image_sz) for s in scales])
+ im_patches = np.transpose(im_patches, (0, 3, 1, 2))
+
+ # Compute features
+ feature_map = layers.concat(
+ TensorList(
+ [f.get_feature(im_patches) for f in self.features]).unroll(),
+ axis=1)
+
+ return feature_map
+
+
+class MultiResolutionExtractor(ExtractorBase):
+ """Multi-resolution feature extractor.
+ args:
+ features: List of features.
+ """
+
+ def __init__(self, features):
+ super().__init__(features)
+ self.is_color = None
+
+ def stride(self):
+ return TensorList(
+ [f.stride() for f in self.features
+ if self._return_feature(f)]).unroll()
+
+ def size(self, input_sz):
+ return TensorList([
+ f.size(input_sz) for f in self.features if self._return_feature(f)
+ ]).unroll()
+
+ def dim(self):
+ return TensorList(
+ [f.dim() for f in self.features
+ if self._return_feature(f)]).unroll()
+
+ def get_fparams(self, name: str=None):
+ if name is None:
+ return [f.fparams for f in self.features if self._return_feature(f)]
+ return TensorList([
+ getattr(f.fparams, name) for f in self.features
+ if self._return_feature(f)
+ ]).unroll()
+
+ def get_attribute(self, name: str, ignore_missing: bool=False):
+ if ignore_missing:
+ return TensorList([
+ getattr(f, name) for f in self.features
+ if self._return_feature(f) and hasattr(f, name)
+ ])
+ else:
+ return TensorList([
+ getattr(f, name, None) for f in self.features
+ if self._return_feature(f)
+ ])
+
+ def get_unique_attribute(self, name: str):
+ feat = None
+ for f in self.features:
+ if self._return_feature(f) and hasattr(f, name):
+ if feat is not None:
+ raise RuntimeError('The attribute was not unique.')
+ feat = f
+ if feat is None:
+ raise RuntimeError('The attribute did not exist')
+ return getattr(feat, name)
+
+ def _return_feature(self, f):
+ return self.is_color is None or (self.is_color and f.use_for_color) or (not self.is_color and f.use_for_gray)
+
+ def set_is_color(self, is_color: bool):
+ self.is_color = is_color
+
+ def extract(self, im, pos, scales, image_sz, debug_save_name=None):
+ """Extract features.
+ args:
+ im: Image.
+ pos: Center position for extraction.
+ scales: Image scales to extract features from.
+ image_sz: Size to resize the image samples to before extraction.
+ """
+ if isinstance(scales, (int, float)):
+ scales = [scales]
+
+ # Get image patches
+ with fluid.dygraph.guard(fluid.CPUPlace()):
+ im_patches = np.stack([
+ sample_patch(im, pos, s * image_sz, image_sz) for s in scales
+ ])
+
+ if debug_save_name is not None:
+ np.save(debug_save_name, im_patches)
+
+ im_patches = np.transpose(im_patches, (0, 3, 1, 2))
+
+ # Compute features
+ feature_map = TensorList(
+ [f.get_feature(im_patches) for f in self.features]).unroll()
+
+ return feature_map
+
+ def extract_transformed(self,
+ im,
+ pos,
+ scale,
+ image_sz,
+ transforms,
+ debug_save_name=None):
+ """Extract features from a set of transformed image samples.
+ args:
+ im: Image.
+ pos: Center position for extraction.
+ scale: Image scale to extract features from.
+ image_sz: Size to resize the image samples to before extraction.
+ transforms: A set of image transforms to apply.
+ """
+
+ # Get image patch
+ im_patch = sample_patch(im, pos, scale * image_sz, image_sz)
+
+ # Apply transforms
+ with fluid.dygraph.guard(fluid.CPUPlace()):
+ im_patches = np.stack([T(im_patch) for T in transforms])
+
+ if debug_save_name is not None:
+ np.save(debug_save_name, im_patches)
+
+ im_patches = np.transpose(im_patches, (0, 3, 1, 2))
+
+ # Compute features
+ feature_map = TensorList(
+ [f.get_feature(im_patches) for f in self.features]).unroll()
+
+ return feature_map
diff --git a/PaddleCV/tracking/pytracking/features/featurebase.py b/PaddleCV/tracking/pytracking/features/featurebase.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9ca9cd1fba0ee0009d13ed6c498e416cf3dedf9
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/features/featurebase.py
@@ -0,0 +1,158 @@
+from paddle import fluid
+from paddle.fluid import layers
+from pytracking.libs import TensorList
+from pytracking.libs.paddle_utils import PTensor, floordiv, n2p, broadcast_op
+
+import numpy as np
+
+
+class FeatureBase:
+ """Base feature class.
+ args:
+ fparams: Feature specific parameters.
+ pool_stride: Amount of average pooling to apply to downsample the feature map.
+ output_size: Alternatively, specify the output size of the feature map. Adaptive average pooling will be applied.
+ normalize_power: The power exponent for the normalization. None means no normalization (default).
+ use_for_color: Use this feature for color images.
+ use_for_gray: Use this feature for grayscale images.
+ """
+
+ def __init__(self,
+ fparams=None,
+ pool_stride=None,
+ output_size=None,
+ normalize_power=None,
+ use_for_color=True,
+ use_for_gray=True):
+ self.fparams = fparams
+ self.pool_stride = 1 if pool_stride is None else pool_stride
+ self.output_size = output_size
+ self.normalize_power = normalize_power
+ self.use_for_color = use_for_color
+ self.use_for_gray = use_for_gray
+
+ def initialize(self):
+ pass
+
+ def free_memory(self):
+ pass
+
+ def dim(self):
+ raise NotImplementedError
+
+ def stride(self):
+ raise NotImplementedError
+
+ def size(self, im_sz):
+ if self.output_size is None:
+ return floordiv(im_sz, self.stride())
+ return self.output_size
+
+ def extract(self, im):
+ """Performs feature extraction."""
+ raise NotImplementedError
+
+ def get_feature(self, im: np.ndarray):
+ """Get the feature. Generally, call this function.
+ args:
+ im: image patch
+ """
+
+ # Return empty tensor if it should not be used
+ is_color = im.shape[1] == 3
+ if is_color and not self.use_for_color or not is_color and not self.use_for_gray:
+ return np.array([])
+
+ # Extract feature
+ feat = self.extract(im)
+
+ # Pool/downsample
+ with fluid.dygraph.guard():
+ feat = n2p(feat)
+
+ if self.output_size is not None:
+ feat = layers.adaptive_pool2d(feat, self.output_size, 'avg')
+ elif self.pool_stride != 1:
+ feat = layers.pool2d(
+ feat,
+ self.pool_stride,
+ pool_stride=self.pool_stride,
+ pool_type='avg')
+
+ # Normalize
+ if self.normalize_power is not None:
+ feat /= (
+ layers.reduce_sum(
+ layers.reshape(
+ layers.abs(feat), [feat.shape[0], 1, 1, -1])**
+ self.normalize_power,
+ dim=3,
+ keep_dim=True) /
+ (feat.shape[1] * feat.shape[2] * feat.shape[3]) + 1e-10)**(
+ 1 / self.normalize_power)
+
+ feat = feat.numpy()
+ return feat
+
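+# A concrete feature is expected to implement dim(), stride() and extract();
+# get_feature() then handles the pooling and normalization above. A minimal
+# sketch (hypothetical, for illustration only):
+#
+#   class IdentityFeature(FeatureBase):
+#       def dim(self):
+#           return 3                 # channels of the raw image
+#       def stride(self):
+#           return self.pool_stride  # no downsampling besides pooling
+#       def extract(self, im):
+#           return im                # use the image itself as the feature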
+
+class MultiFeatureBase(FeatureBase):
+ """Base class for features potentially having multiple feature blocks as output (like CNNs).
+ See FeatureBase for more info.
+ """
+
+ def size(self, im_sz):
+ if self.output_size is None:
+ return TensorList([floordiv(im_sz, s) for s in self.stride()])
+ if isinstance(im_sz, PTensor):
+ return TensorList([
+ floordiv(im_sz, s) if sz is None else np.array([sz[0], sz[1]])
+ for sz, s in zip(self.output_size, self.stride())
+ ])
+
+ def get_feature(self, im: np.ndarray):
+ """Get the feature. Generally, call this function.
+ args:
+ im: image patch
+ """
+
+ # Return empty tensor if it should not be used
+ is_color = im.shape[1] == 3
+ if is_color and not self.use_for_color or not is_color and not self.use_for_gray:
+ return np.array([])
+
+ feat_list = self.extract(im)
+
+ output_sz = [None] * len(
+ feat_list) if self.output_size is None else self.output_size
+
+ # Pool/downsample
+ with fluid.dygraph.guard():
+ feat_list = [n2p(f) for f in feat_list]
+
+ for i, (sz, s) in enumerate(zip(output_sz, self.pool_stride)):
+ if sz is not None:
+ feat_list[i] = layers.adaptive_pool2d(
+ feat_list[i], sz, pool_type='avg')
+ elif s != 1:
+ feat_list[i] = layers.pool2d(
+ feat_list[i], s, pool_stride=s, pool_type='avg')
+
+ # Normalize
+ if self.normalize_power is not None:
+ new_feat_list = []
+ for feat in feat_list:
+ norm = (layers.reduce_sum(
+ layers.reshape(
+ layers.abs(feat), [feat.shape[0], 1, 1, -1])**
+ self.normalize_power,
+ dim=3,
+ keep_dim=True) /
+ (feat.shape[1] * feat.shape[2] * feat.shape[3]
+ ) + 1e-10)**(1 / self.normalize_power)
+ feat = broadcast_op(feat, norm, 'div')
+ new_feat_list.append(feat)
+ feat_list = new_feat_list
+
+ # To numpy
+ feat_list = TensorList([f.numpy() for f in feat_list])
+ return feat_list
diff --git a/PaddleCV/tracking/pytracking/features/preprocessing.py b/PaddleCV/tracking/pytracking/features/preprocessing.py
new file mode 100644
index 0000000000000000000000000000000000000000..280f40e2753719c13d6635eaf78296513a602de4
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/features/preprocessing.py
@@ -0,0 +1,120 @@
+import numpy as np
+import cv2 as cv
+from paddle.fluid import dygraph
+from paddle.fluid import layers
+from pytracking.libs.paddle_utils import PTensor, n2p, _padding, squeeze, unsqueeze
+
+
+def numpy_to_paddle(a: np.ndarray):
+ return unsqueeze(
+ layers.transpose(
+ layers.cast(dygraph.to_variable(a), 'float32'), [2, 0, 1]), [0])
+
+
+def paddle_to_numpy(a: PTensor):
+ return layers.transpose(squeeze(a, [0]), [1, 2, 0]).numpy()
+
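+# Note: numpy_to_paddle maps an H x W x C image array to a 1 x C x H x W
+# float32 tensor (adding a batch dimension); paddle_to_numpy is its inverse.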
+
+def sample_patch(im: np.ndarray,
+ pos: np.ndarray,
+ sample_sz: np.ndarray,
+ output_sz: np.ndarray=None):
+ """Sample an image patch.
+
+ args:
+ im: Image
+ pos: center position of crop
+ sample_sz: size to crop
+ output_sz: size to resize to
+ """
+
+ # copy and convert
+ posl = pos.astype('long')
+
+ # Compute pre-downsampling factor
+ if output_sz is not None:
+ resize_factor = np.min(
+ sample_sz.astype('float32') / output_sz.astype('float32'))
+ df = int(max(int(resize_factor - 0.1), 1))
+ else:
+ df = int(1)
+
+ sz = sample_sz.astype('float32') / df # new size
+
+ # Do downsampling
+ if df > 1:
+ os = posl % df # offset
+ posl = ((posl - os) / df).astype('long') # new position
+ im2 = im[os[0]::df, os[1]::df] # downsample
+ else:
+ im2 = im
+
+ # compute size to crop
+ szl = np.maximum(
+ np.round(sz), np.array(
+ [2., 2.], dtype='float32')).astype('long')
+
+ # Extract top and bottom coordinates
+ tl = posl - (szl - 1) // 2
+ br = posl + szl // 2
+
+ # Get image patch
+ im_patch = _padding(
+ im2, (0, 0, -tl[1], br[1] - im2.shape[1] + 1, -tl[0],
+ br[0] - im2.shape[0] + 1),
+ mode='replicate')
+
+ if output_sz is None or (im_patch.shape[0] == output_sz[0] and
+ im_patch.shape[1] == output_sz[1]):
+ return im_patch
+
+ # Resample
+ osz = output_sz.astype('long')
+ im_patch = cv.resize(
+ im_patch, (osz[1], osz[0]), interpolation=cv.INTER_LINEAR)
+ return im_patch
+
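+# Worked example of the crop arithmetic used by the sampling functions in this
+# file: with posl = [50, 60] and szl = [41, 41],
+#   tl = posl - (szl - 1) // 2 = [30, 40]
+#   br = posl + szl // 2 = [70, 80]
+# so the crop spans br - tl + 1 = 41 rows/columns, centered on posl. _padding
+# then replicate-pads whenever tl is negative or br runs past the image border.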
+
+def sample_patch_with_mean_pad(im: np.ndarray,
+ pos: np.ndarray,
+ sample_sz: np.ndarray,
+ output_sz: np.ndarray=None):
+ """Sample an image patch.
+
+ args:
+ im: Image
+ pos: center position of crop
+ sample_sz: size to crop
+ output_sz: size to resize to
+ """
+
+ # copy and convert
+ # posl = np.round(pos).astype('long') # TODO: maybe we should use round
+ posl = pos.astype('long')
+
+ im2 = im
+ sz = sample_sz.astype('float32')
+ # compute size to crop
+ szl = np.maximum(
+ np.round(sz), np.array(
+ [2., 2.], dtype='float32')).astype('long')
+
+ # Extract top and bottom coordinates
+ tl = posl - (szl - 1) // 2
+ br = posl + szl // 2
+
+ # Get image patch
+ im_patch = _padding(
+ im2, (0, 0, -tl[1], br[1] - im2.shape[1] + 1, -tl[0],
+ br[0] - im2.shape[0] + 1),
+ mode='replicate')
+
+ if output_sz is None or (im_patch.shape[0] == output_sz[0] and
+ im_patch.shape[1] == output_sz[1]):
+ return im_patch
+
+ # Resample
+ osz = output_sz.astype('long')
+ im_patch = cv.resize(
+ im_patch, (osz[1], osz[0]), interpolation=cv.INTER_LINEAR)
+ return im_patch
diff --git a/PaddleCV/tracking/pytracking/libs/Fconv2d.py b/PaddleCV/tracking/pytracking/libs/Fconv2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d926b53934ac4b5b91f1d3859f00afffde8d7a2
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/libs/Fconv2d.py
@@ -0,0 +1,259 @@
+from __future__ import print_function
+import paddle
+import paddle.fluid as fluid
+
+from paddle.fluid import core
+
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.dygraph.layer_object_helper import LayerObjectHelper
+
+from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer
+
+from paddle.fluid.param_attr import ParamAttr
+
+from paddle.fluid.framework import Variable, OpProtoHolder, in_dygraph_mode
+from paddle.fluid.layers import utils
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+
+from paddle.fluid import core
+
+from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer
+
+from paddle.fluid.dygraph import dygraph_utils
+
+from paddle.fluid.framework import Variable, OpProtoHolder, in_dygraph_mode
+from paddle.fluid.layers import utils
+
+
+def Fconv2d(
+ input,
+ filter,
+ stride=1,
+ padding=0,
+ dilation=1,
+ groups=1,
+ use_cudnn=True, ):
+ """
+ Similar to conv2d, this is a convolution2D layer. The difference
+ is that the filter is taken as an input directly instead of setting the
+ filter size and number of filters. Filter is a 4-D tensor with shape
+ [num_filter, num_channel, filter_size_h, filter_size_w].
+ Args:
+ input (Variable): The input image with [N, C, H, W] format.
+ filter(Variable): The input filter with [out_channels, in_channels, H, W] format.
+ stride (int|tuple): The stride size. If stride is a tuple, it must
+ contain two integers, (stride_H, stride_W). Otherwise, the
+ stride_H = stride_W = stride. Default: stride = 1.
+ padding (int|tuple): The padding size. If padding is a tuple, it must
+ contain two integers, (padding_H, padding_W). Otherwise, the
+ padding_H = padding_W = padding. Default: padding = 0.
+ dilation (int|tuple): The dilation size. If dilation is a tuple, it must
+ contain two integers, (dilation_H, dilation_W). Otherwise, the
+ dilation_H = dilation_W = dilation. Default: dilation = 1.
+ groups (int): The groups number of the conv layer. Grouped convolution
+ splits the input and output channels into `groups` independent groups.
+ Default: groups = 1.
+ use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
+ library is installed. Default: True
+ Returns:
+ Variable: The tensor variable storing the convolution and \
+ non-linearity activation result.
+ Raises:
+ ValueError: If the shapes of input, filter_size, stride, padding and
+ groups mismatch.
+ Examples:
+ .. code-block:: python
+ data = fluid.layers.data(name='data', shape=[3, 32, 32], \
+ dtype='float32')
+ filter = fluid.layers.data(name='filter',shape=[10,3,3,3], \
+ dtype='float32',append_batch_size=False)
+ conv2d = Fconv2d(input=data, filter=filter)
+ """
+ conv_with_filter = Conv2D(
+ stride=stride, padding=padding, dilation=dilation, groups=groups)
+ return conv_with_filter(input, filter)
+
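+# Usage sketch for the dygraph wrapper above (shapes are illustrative only):
+#
+#   with fluid.dygraph.guard():
+#       x = fluid.dygraph.to_variable(np.ones((1, 3, 8, 8), 'float32'))
+#       w = fluid.dygraph.to_variable(np.ones((4, 3, 3, 3), 'float32'))
+#       y = Fconv2d(x, w, stride=1, padding=1)  # -> shape [1, 4, 8, 8]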
+
+class Conv2D(fluid.dygraph.layers.Layer):
+ """
+ This interface is used to construct a callable object of the ``Conv2D`` class.
+ For more details, refer to code examples.
+ The convolution2D layer calculates the output based on the input, filter
+ and strides, paddings, dilations, groups parameters. Input and
+ Output are in NCHW format, where N is batch size, C is the number of
+ the feature map, H is the height of the feature map, and W is the width of the feature map.
+ Filter's shape is [MCHW] , where M is the number of output feature map,
+ C is the number of input feature map, H is the height of the filter,
+ and W is the width of the filter. If the groups is greater than 1,
+ C will equal the number of input feature map divided by the groups.
+ Please refer to UFLDL's convolution tutorial for more details.
+ If bias attribution and activation type are provided, bias is added to the
+ output of the convolution, and the corresponding activation function is
+ applied to the final result.
+ For each input :math:`X`, the equation is:
+ .. math::
+ Out = \\sigma (W \\ast X + b)
+ Where:
+ * :math:`X`: Input value, a ``Tensor`` with NCHW format.
+ * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] .
+ * :math:`\\ast`: Convolution operation.
+ * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1].
+ * :math:`\\sigma`: Activation function.
+ * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+ Example:
+ - Input:
+ Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
+ Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
+ - Output:
+ Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
+ Where
+ .. math::
+ H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
+ W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
+ Parameters:
+ stride (int or tuple, optional): The stride size. If stride is a tuple, it must
+ contain two integers, (stride_H, stride_W). Otherwise, the
+ stride_H = stride_W = stride. Default: 1.
+ padding (int or tuple, optional): The padding size. If padding is a tuple, it must
+ contain two integers, (padding_H, padding_W). Otherwise, the
+ padding_H = padding_W = padding. Default: 0.
+ dilation (int or tuple, optional): The dilation size. If dilation is a tuple, it must
+ contain two integers, (dilation_H, dilation_W). Otherwise, the
+ dilation_H = dilation_W = dilation. Default: 1.
+ groups (int, optional): The groups number of the Conv2d Layer. According to grouped
+ convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+ the first half of the filters is only connected to the first half
+ of the input channels, while the second half of the filters is only
+ connected to the second half of the input channels. Default: 1.
+ use_cudnn (bool, optional): Use cudnn kernel or not, it is valid only when the cudnn
+ library is installed. Default: True.
+ act (str, optional): Activation type, if it is set to None, activation is not appended.
+ Default: None.
+ dtype (str, optional): Data type, it can be "float32" or "float64". Default: "float32".
+ Note:
+ Unlike the stock ``fluid.dygraph.Conv2D``, this layer creates no learnable
+ parameters; the filter weight (and optional bias) are passed to ``forward``.
+ Returns:
+ None
+
+ Raises:
+ ValueError: if ``use_cudnn`` is not a bool value.
+ Examples:
+ .. code-block:: python
+ from paddle.fluid.dygraph.base import to_variable
+ import paddle.fluid as fluid
+ import numpy as np
+ data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
+ weight = np.random.uniform(-1, 1, [2, 3, 3, 3]).astype('float32')
+ with fluid.dygraph.guard():
+ conv2d = Conv2D(stride=1, padding=1)
+ conv = conv2d(to_variable(data), to_variable(weight))
+ """
+
+ def __init__(self,
+ stride=1,
+ padding=0,
+ dilation=1,
+ groups=None,
+ use_cudnn=True,
+ act=None,
+ dtype='float32'):
+ super(Conv2D, self).__init__()
+ self._groups = groups
+ self._stride = utils.convert_to_list(stride, 2, 'stride')
+ self._padding = utils.convert_to_list(padding, 2, 'padding')
+ self._dilation = utils.convert_to_list(dilation, 2, 'dilation')
+ self._act = act
+ if not isinstance(use_cudnn, bool):
+ raise ValueError("use_cudnn should be True or False")
+ self._use_cudnn = use_cudnn
+ self._dtype = dtype
+
+ # TODO: recover the usage of depthwise_conv2d when it's
+ # kernel fixed https://github.com/PaddlePaddle/Paddle/issues/17098
+ # if (self._num_channels == self._groups and
+ # num_filters % self._num_channels == 0 and not self._use_cudnn):
+ # self._l_type = 'depthwise_conv2d'
+ # else:
+ # self._l_type = 'conv2d'
+ self._l_type = 'conv2d'
+
+ def forward(self, input, weight, bias=None):
+ inputs = {
+ 'Input': [input],
+ 'Filter': [weight],
+ }
+ attrs = {
+ 'strides': self._stride,
+ 'paddings': self._padding,
+ 'dilations': self._dilation,
+ 'groups': self._groups if self._groups else 1,
+ 'use_cudnn': self._use_cudnn,
+ 'use_mkldnn': False,
+ }
+
+ if in_dygraph_mode():
+ outs = core.ops.conv2d(inputs, attrs)
+ pre_bias = outs['Output'][0]
+
+ pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, bias, 1)
+
+ return dygraph_utils._append_activation_in_dygraph(pre_act,
+ self._act)
+
+ pre_bias = self._helper.create_variable_for_type_inference(
+ dtype=self._dtype)
+
+ self._helper.append_op(
+ type=self._l_type,
+ inputs={
+ 'Input': input,
+ 'Filter': weight,
+ },
+ outputs={"Output": pre_bias},
+ attrs=attrs)
+
+ if bias is not None:
+ pre_act = self._helper.create_variable_for_type_inference(
+ dtype=self._dtype)
+ self._helper.append_op(
+ type='elementwise_add',
+ inputs={'X': [pre_bias],
+ 'Y': [bias]},
+ outputs={'Out': [pre_act]},
+ attrs={'axis': 1})
+ else:
+ pre_act = pre_bias
+
+ # Currently, we don't support inplace in dygraph mode
+ return self._helper.append_activation(pre_act, act=self._act)
diff --git a/PaddleCV/tracking/pytracking/libs/Fconv2d_static.py b/PaddleCV/tracking/pytracking/libs/Fconv2d_static.py
new file mode 100644
index 0000000000000000000000000000000000000000..d62edb7083d67559c7c6c0304be976a163a00117
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/libs/Fconv2d_static.py
@@ -0,0 +1,173 @@
+from __future__ import print_function
+import paddle
+import paddle.fluid as fluid
+
+from paddle.fluid import core
+
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.dygraph.layer_object_helper import LayerObjectHelper
+
+from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer
+
+from paddle.fluid.param_attr import ParamAttr
+
+from paddle.fluid.framework import Variable, OpProtoHolder, in_dygraph_mode
+from paddle.fluid.layers import utils
+import numpy as np
+
+
+def Fconv2d(input,
+ filter,
+ stride=1,
+ padding=0,
+ dilation=1,
+ groups=None,
+ use_cudnn=True,
+ name=None):
+ """
+ Similar to conv2d, this is a convolution2D layer. The difference
+ is that the filter is taken as an input directly instead of setting the
+ filter size and number of filters. Filter is a 4-D tensor with shape
+ [num_filter, num_channel, filter_size_h, filter_size_w].
+ Args:
+ input (Variable): The input image with [N, C, H, W] format.
+ filter(Variable): The input filter with [out_channels, in_channels, H, W] format.
+ stride (int|tuple): The stride size. If stride is a tuple, it must
+ contain two integers, (stride_H, stride_W). Otherwise, the
+ stride_H = stride_W = stride. Default: stride = 1.
+ padding (int|tuple): The padding size. If padding is a tuple, it must
+ contain two integers, (padding_H, padding_W). Otherwise, the
+ padding_H = padding_W = padding. Default: padding = 0.
+ dilation (int|tuple): The dilation size. If dilation is a tuple, it must
+ contain two integers, (dilation_H, dilation_W). Otherwise, the
+ dilation_H = dilation_W = dilation. Default: dilation = 1.
+ groups (int|None): The groups number of the conv layer. Default: None,
+ which is treated as groups = 1.
+ use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
+ library is installed. Default: True
+ name (str|None): A name for this layer(optional). If set None, the layer
+ will be named automatically. Default: None
+ Returns:
+ Variable: The tensor variable storing the convolution and \
+ non-linearity activation result.
+ Raises:
+ ValueError: If the shapes of input, filter_size, stride, padding and
+ groups mismatch.
+ Examples:
+ .. code-block:: python
+ data = fluid.layers.data(name='data', shape=[3, 32, 32], \
+ dtype='float32')
+ filter = fluid.layers.data(name='filter',shape=[10,3,3,3], \
+ dtype='float32',append_batch_size=False)
+ conv2d = Fconv2d(input=data, filter=filter)
+ """
+ helper = LayerHelper("conv2d_with_filter", **locals())
+ num_channels = input.shape[1]
+ num_filters = filter.shape[0]
+ num_filter_channels = filter.shape[1]
+ l_type = 'conv2d'
+ # if (num_channels == groups and
+ if (num_channels == groups and num_filters % num_channels == 0 and
+ not use_cudnn):
+ l_type = 'depthwise_conv2d'
+ if groups is None:
+ assert num_filter_channels == num_channels
+ groups = 1
+ else:
+ if num_channels % groups != 0:
+ raise ValueError("num_channels must be divisible by groups.")
+ if num_channels // groups != num_filter_channels:
+ raise ValueError("num_filter_channels must equal to num_channels\
+ divided by groups.")
+
+ stride = utils.convert_to_list(stride, 2, 'stride')
+ padding = utils.convert_to_list(padding, 2, 'padding')
+ dilation = utils.convert_to_list(dilation, 2, 'dilation')
+ if not isinstance(use_cudnn, bool):
+ raise ValueError("use_cudnn should be True or False")
+ pre_bias = helper.create_variable_for_type_inference(dtype=input.dtype)
+ helper.append_op(
+ type=l_type,
+ inputs={
+ 'Input': input,
+ 'Filter': filter,
+ },
+ outputs={"Output": pre_bias},
+ attrs={
+ 'strides': stride,
+ 'paddings': padding,
+ 'dilations': dilation,
+ 'groups': groups,
+ 'use_cudnn': use_cudnn,
+ 'use_mkldnn': False
+ })
+
+ return pre_bias
+
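+# Usage sketch for the static-graph variant above; unlike the dygraph wrapper
+# in Fconv2d.py, it must be called while building a program (names and shapes
+# below are illustrative only):
+#
+#   with fluid.program_guard(fluid.Program(), fluid.Program()):
+#       x = fluid.layers.data('x', [1, 3, 8, 8], append_batch_size=False)
+#       w = fluid.layers.data('w', [4, 3, 3, 3], append_batch_size=False)
+#       y = Fconv2d(x, w, padding=1)  # static Variable of shape [1, 4, 8, 8]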
+
+def test_conv2d_with_filter():
+ exemplar = np.random.random((8, 4, 6, 6)).astype(np.float32)
+ instance = np.random.random((8, 4, 22, 22)).astype(np.float32)
+
+ # fluid.layers.data(append_batch_size=)
+ use_gpu = False
+ place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+
+ train_program = fluid.Program()
+ start_program = fluid.Program()
+
+ with fluid.program_guard(train_program, start_program):
+ x = fluid.layers.data(
+ name="inst", shape=[8, 4, 22, 22], append_batch_size=False)
+ y = fluid.layers.data(
+ name="exem", shape=[8, 4, 6, 6], append_batch_size=False)
+ bias_att = fluid.ParamAttr(
+ name="bias_", initializer=fluid.initializer.ConstantInitializer(1.))
+ out = Fconv2d(x, y, groups=1)
+ weight_att = fluid.ParamAttr(
+ name='weight',
+ initializer=fluid.initializer.NumpyArrayInitializer(exemplar))
+ bias_att = fluid.ParamAttr(
+ name="bias", initializer=fluid.initializer.ConstantInitializer(0.))
+ res = fluid.layers.conv2d(
+ x,
+ 8,
+ 6,
+ param_attr=weight_att,
+ bias_attr=bias_att,
+ stride=1,
+ padding=0,
+ dilation=1)
+
+ exe = fluid.Executor(place)
+ exe.run(program=fluid.default_startup_program())
+ print(out.shape)
+
+ compiled_prog = fluid.compiler.CompiledProgram(train_program)
+ out, res = exe.run(compiled_prog,
+ feed={"inst": instance,
+ "exem": exemplar},
+ fetch_list=[out.name, res.name])
+
+ print(np.sum(out - res))
+ np.testing.assert_allclose(out, res, rtol=1e-5, atol=0)
+
+ with fluid.dygraph.guard():
+ exem = fluid.dygraph.to_variable(exemplar)
+ inst = fluid.dygraph.to_variable(instance)
+
+ out = Fconv2d(inst, exem, groups=1)
+
+ print(np.sum(out.numpy() - res))
+ np.testing.assert_allclose(out.numpy(), res, rtol=1e-5, atol=0)
+
+
+if __name__ == '__main__':
+ test_conv2d_with_filter()
diff --git a/PaddleCV/tracking/pytracking/libs/__init__.py b/PaddleCV/tracking/pytracking/libs/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0cb37ab5fe647283c1fb035260b4681a1ac4fd6
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/libs/__init__.py
@@ -0,0 +1,2 @@
+from .tensorlist import TensorList
+from .tensordict import TensorDict
diff --git a/PaddleCV/tracking/pytracking/libs/complex.py b/PaddleCV/tracking/pytracking/libs/complex.py
new file mode 100644
index 0000000000000000000000000000000000000000..1de88ef856eb7478dc27e08aa3c2bf392891e9b3
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/libs/complex.py
@@ -0,0 +1,212 @@
+import numpy as np
+from pytracking.libs.tensorlist import tensor_operation
+
+
+def is_complex(a: np.array) -> bool:
+ return a.ndim >= 4 and a.shape[-1] == 2
+
+
+def is_real(a: np.array) -> bool:
+ return not is_complex(a)
+
+
+@tensor_operation
+def mult(a: np.array, b: np.array):
+ """Pointwise complex multiplication of complex tensors."""
+
+ if is_real(a):
+ if a.ndim >= b.ndim:
+ raise ValueError('Incorrect dimensions.')
+ # a is real
+ return mult_real_cplx(a, b)
+ if is_real(b):
+ if b.ndim >= a.ndim:
+ raise ValueError('Incorrect dimensions.')
+ # b is real
+ return mult_real_cplx(b, a)
+
+ # Both complex
+ c = mult_real_cplx(a[..., 0], b)
+ c[..., 0] -= a[..., 1] * b[..., 1]
+ c[..., 1] += a[..., 1] * b[..., 0]
+ return c
+
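+# Convention in this module: a "complex" array carries its real and imaginary
+# parts in a trailing dimension of size 2. Worked example for mult():
+#   a = np.reshape([1., 2.], (1, 1, 1, 2))   # 1 + 2i
+#   b = np.reshape([3., 4.], (1, 1, 1, 2))   # 3 + 4i
+#   mult(a, b)                               # -> [-5., 10.], i.e. -5 + 10i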
+
+@tensor_operation
+def mult_conj(a: np.array, b: np.array):
+ """Pointwise complex multiplication of complex tensors, with conjugate on b: a*conj(b)."""
+
+ if is_real(a):
+ if a.ndim >= b.ndim:
+ raise ValueError('Incorrect dimensions.')
+ # a is real
+ return mult_real_cplx(a, conj(b))
+ if is_real(b):
+ if b.ndim >= a.ndim:
+ raise ValueError('Incorrect dimensions.')
+ # b is real
+ return mult_real_cplx(b, a)
+
+ # Both complex
+ c = mult_real_cplx(b[..., 0], a)
+ c[..., 0] += a[..., 1] * b[..., 1]
+ c[..., 1] -= a[..., 0] * b[..., 1]
+ return c
+
+
+@tensor_operation
+def mult_real_cplx(a: np.array, b: np.array):
+ """Pointwise complex multiplication of real tensor a with complex tensor b."""
+
+ if is_real(b):
+ raise ValueError('Last dimension must have length 2.')
+
+ return np.expand_dims(a, -1) * b
+
+
+@tensor_operation
+def div(a: np.array, b: np.array):
+ """Pointwise complex division of complex tensors."""
+
+ if is_real(b):
+ if b.ndim >= a.ndim:
+ raise ValueError('Incorrect dimensions.')
+ # b is real
+ return div_cplx_real(a, b)
+
+ return div_cplx_real(mult_conj(a, b), abs_sqr(b))
+
+
+@tensor_operation
+def div_cplx_real(a: np.array, b: np.array):
+ """Pointwise complex division of complex tensor a with real tensor b."""
+
+ if is_real(a):
+ raise ValueError('Last dimension must have length 2.')
+
+ return a / np.expand_dims(b, -1)
+
+
+@tensor_operation
+def abs_sqr(a: np.array):
+ """Squared absolute value."""
+
+ if is_real(a):
+ raise ValueError('Last dimension must have length 2.')
+
+ return np.sum(a * a, -1)
+
+
+@tensor_operation
+def abs(a: np.array):
+ """Absolute value."""
+
+ if is_real(a):
+ raise ValueError('Last dimension must have length 2.')
+
+ return np.sqrt(abs_sqr(a))
+
+
+@tensor_operation
+def conj(a: np.array):
+ """Complex conjugate."""
+
+ if is_real(a):
+ raise ValueError('Last dimension must have length 2.')
+
+ # return a * np.array([1, -1], device=a.device)
+ return complex(a[..., 0], -a[..., 1])
+
+
+@tensor_operation
+def real(a: np.array):
+ """Real part."""
+
+ if is_real(a):
+ raise ValueError('Last dimension must have length 2.')
+
+ return a[..., 0]
+
+
+@tensor_operation
+def imag(a: np.array):
+ """Imaginary part."""
+
+ if is_real(a):
+ raise ValueError('Last dimension must have length 2.')
+
+ return a[..., 1]
+
+
+@tensor_operation
+def complex(a: np.array, b: np.array=None):
+ """Create complex tensor from real and imaginary part."""
+
+ if b is None:
+ b = np.zeros(a.shape, a.dtype)
+ elif a is None:
+ a = np.zeros(b.shape, b.dtype)
+
+ return np.concatenate((np.expand_dims(a, -1), np.expand_dims(b, -1)), -1)
+
+
+@tensor_operation
+def mtimes(a: np.array, b: np.array, conj_a=False, conj_b=False):
+ """Complex matrix multiplication of complex tensors.
+ The dimensions (-3, -2) are matrix multiplied. -1 is the complex dimension."""
+
+ if is_real(a):
+ if a.ndim >= b.ndim:
+ raise ValueError('Incorrect dimensions.')
+ return mtimes_real_complex(a, b, conj_b=conj_b)
+ if is_real(b):
+ if b.ndim >= a.ndim:
+ raise ValueError('Incorrect dimensions.')
+ return mtimes_complex_real(a, b, conj_a=conj_a)
+
+ if not conj_a and not conj_b:
+ return complex(
+ np.matmul(a[..., 0], b[..., 0]) - np.matmul(a[..., 1], b[..., 1]),
+ np.matmul(a[..., 0], b[..., 1]) + np.matmul(a[..., 1], b[..., 0]))
+ if conj_a and not conj_b:
+ return complex(
+ np.matmul(a[..., 0], b[..., 0]) + np.matmul(a[..., 1], b[..., 1]),
+ np.matmul(a[..., 0], b[..., 1]) - np.matmul(a[..., 1], b[..., 0]))
+ if not conj_a and conj_b:
+ return complex(
+ np.matmul(a[..., 0], b[..., 0]) + np.matmul(a[..., 1], b[..., 1]),
+ np.matmul(a[..., 1], b[..., 0]) - np.matmul(a[..., 0], b[..., 1]))
+ if conj_a and conj_b:
+ return complex(
+ np.matmul(a[..., 0], b[..., 0]) - np.matmul(a[..., 1], b[..., 1]),
+ -np.matmul(a[..., 0], b[..., 1]) - np.matmul(a[..., 1], b[..., 0]))
+
+
+@tensor_operation
+def mtimes_real_complex(a: np.array, b: np.array, conj_b=False):
+ if is_real(b):
+ raise ValueError('Incorrect dimensions.')
+
+ if not conj_b:
+ return complex(np.matmul(a, b[..., 0]), np.matmul(a, b[..., 1]))
+ if conj_b:
+ return complex(np.matmul(a, b[..., 0]), -np.matmul(a, b[..., 1]))
+
+
+@tensor_operation
+def mtimes_complex_real(a: np.array, b: np.array, conj_a=False):
+ if is_real(a):
+ raise ValueError('Incorrect dimensions.')
+
+ if not conj_a:
+ return complex(np.matmul(a[..., 0], b), np.matmul(a[..., 1], b))
+ if conj_a:
+ return complex(np.matmul(a[..., 0], b), -np.matmul(a[..., 1], b))
+
+
+@tensor_operation
+def exp_imag(a: np.array):
+ """Complex exponential with imaginary input: e^(i*a)"""
+
+ a = np.expand_dims(a, -1)
+ return np.concatenate((np.cos(a), np.sin(a)), -1)
diff --git a/PaddleCV/tracking/pytracking/libs/dcf.py b/PaddleCV/tracking/pytracking/libs/dcf.py
new file mode 100644
index 0000000000000000000000000000000000000000..4aaa70c54253d80a7a1d8e470c6e840464a3b23c
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/libs/dcf.py
@@ -0,0 +1,137 @@
+import math
+import numpy as np
+from pytracking.libs import fourier
+from pytracking.libs import complex
+from pytracking.libs.paddle_utils import _padding
+
+
+def hann1d(sz: int, centered=True) -> np.ndarray:
+ """1D cosine window."""
+ if centered:
+ return 0.5 * (1 - np.cos(
+ (2 * math.pi / (sz + 2)) * np.arange(1, sz + 1, 1, 'float32')))
+ w = 0.5 * (1 + np.cos(
+ (2 * math.pi / (sz + 2)) * np.arange(0, sz // 2 + 1, 1, 'float32')))
+ return np.concatenate([w, np.flip(w[1:sz - sz // 2], 0)])
+
+
+def hann2d(sz: np.ndarray, centered=True) -> np.ndarray:
+ """2D cosine window."""
+ return np.reshape(hann1d(sz[0], centered), (1, 1, -1, 1)) * \
+ np.reshape(hann1d(sz[1], centered), (1, 1, 1, -1))
+
+
+def hann2d_clipped(sz: np.ndarray, effective_sz: np.ndarray,
+ centered=True) -> np.ndarray:
+ """1D clipped cosine window."""
+
+ # Ensure that the difference is even
+ effective_sz += (effective_sz - sz) % 2
+ effective_window = np.reshape(hann1d(effective_sz[0], True), (1, 1, -1, 1)) * \
+ np.reshape(hann1d(effective_sz[1], True), (1, 1, 1, -1))
+
+ pad = np.int32((sz - effective_sz) / 2)
+ window = _padding(
+ effective_window, (pad[1], pad[1], pad[0], pad[0]), mode='replicate')
+
+ if centered:
+ return window
+ else:
+ mid = np.int32((sz / 2))
+ window_shift_lr = np.concatenate(
+ (window[..., mid[1]:], window[..., :mid[1]]), 3)
+ return np.concatenate((window_shift_lr[..., mid[0]:, :],
+ window_shift_lr[..., :mid[0], :]), 2)
+
+
+def gauss_fourier(sz: int, sigma: float, half: bool=False) -> np.ndarray:
+ if half:
+ k = np.arange(0, int(sz / 2 + 1), 1, 'float32')
+ else:
+ k = np.arange(-int((sz - 1) / 2), int(sz / 2 + 1), 1, 'float32')
+ return (math.sqrt(2 * math.pi) * sigma / sz) * np.exp(-2 * np.square(
+ math.pi * sigma * k / sz))
+
+
+def gauss_spatial(sz, sigma, center=0, end_pad=0):
+ k = np.arange(-(sz - 1) / 2, (sz + 1) / 2 + end_pad, 1, 'float32')
+ return np.exp(-1.0 / (2 * sigma**2) * np.square(k - center))
+
+
+def label_function(sz: np.ndarray, sigma: np.ndarray):
+ return np.reshape(gauss_fourier(sz[0], sigma[0]), (1, 1, -1, 1)) * \
+ np.reshape(gauss_fourier(sz[1], sigma[1], True), (1, 1, 1, -1))
+
+
+def label_function_spatial(sz: np.ndarray,
+ sigma: np.ndarray,
+ center: np.ndarray=None,
+ end_pad: np.ndarray=None):
+ """The origin is in the middle of the image."""
+ if center is None: center = np.zeros((2, ), 'float32')
+ if end_pad is None: end_pad = np.zeros((2, ), 'float32')
+ return np.reshape(gauss_spatial(sz[0], sigma[0], center[0], end_pad[0]), (1, 1, -1, 1)) * \
+ np.reshape(gauss_spatial(sz[1], sigma[1], center[1], end_pad[1]), (1, 1, 1, -1))
+
+
+def cubic_spline_fourier(f, a):
+ """The continuous Fourier transform of a cubic spline kernel."""
+
+ bf = (6 * (1 - np.cos(2 * math.pi * f)) + 3 * a * (1 - np.cos(4 * math.pi * f))
+ - (6 + 8 * a) * math.pi * f * np.sin(2 * math.pi * f) - 2 * a * math.pi * f * np.sin(4 * math.pi * f)) \
+ / (4 * math.pi ** 4 * f ** 4)
+ bf[f == 0] = 1
+ return bf
+
+
+def get_interp_fourier(sz: np.ndarray,
+ method='ideal',
+ bicubic_param=0.5,
+ centering=True,
+ windowing=False,
+ device='cpu'):
+ ky, kx = fourier.get_frequency_coord(sz)
+
+ if method == 'ideal':
+ interp_y = np.ones(ky.shape) / sz[0]
+ interp_x = np.ones(kx.shape) / sz[1]
+ elif method == 'bicubic':
+ interp_y = cubic_spline_fourier(ky / sz[0], bicubic_param) / sz[0]
+ interp_x = cubic_spline_fourier(kx / sz[1], bicubic_param) / sz[1]
+ else:
+ raise ValueError('Unknown method.')
+
+ if centering:
+ interp_y = complex.mult(interp_y,
+ complex.exp_imag((-math.pi / sz[0]) * ky))
+ interp_x = complex.mult(interp_x,
+ complex.exp_imag((-math.pi / sz[1]) * kx))
+
+ if windowing:
+ raise NotImplementedError
+
+ return interp_y, interp_x
+
+
+def interpolate_dft(a: np.ndarray, interp_fs) -> np.ndarray:
+ if isinstance(interp_fs, np.ndarray):
+ return complex.mult(a, interp_fs)
+ if isinstance(interp_fs, (tuple, list)):
+ return complex.mult(complex.mult(a, interp_fs[0]), interp_fs[1])
+ raise ValueError('"interp_fs" must be tensor or tuple of tensors.')
+
+
+def max2d(a: np.ndarray) -> (np.ndarray, np.ndarray):
+ """Computes maximum and argmax in the last two dimensions."""
+ argmax_row = np.argmax(a, axis=-2)
+ max_val_row = np.max(a, axis=-2)
+ argmax_col = np.argmax(max_val_row, axis=-1)
+ max_val = np.max(max_val_row, axis=-1)
+
+ argmax_row = np.reshape(argmax_row, (
+ argmax_col.size, -1))[np.arange(argmax_col.size), argmax_col.flatten()]
+ argmax_row = argmax_row.reshape(argmax_col.shape)
+ argmax = np.concatenate(
+ (np.expand_dims(argmax_row, -1), np.expand_dims(argmax_col, -1)), -1)
+
+ return max_val, argmax
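+
+
+# Example: for a = np.reshape([[1., 3.], [2., 0.]], (1, 1, 2, 2)), max2d(a)
+# returns max_val = [[3.]] and argmax = [[[0, 1]]], i.e. the maximum 3 sits at
+# row 0, column 1 of the last two dimensions.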
diff --git a/PaddleCV/tracking/pytracking/libs/fourier.py b/PaddleCV/tracking/pytracking/libs/fourier.py
new file mode 100644
index 0000000000000000000000000000000000000000..d515db34901320cf4f2a20d0ad9b5e9de92dd03e
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/libs/fourier.py
@@ -0,0 +1,163 @@
+import numpy as np
+
+from pytracking.libs import complex, TensorList
+from pytracking.libs.tensorlist import tensor_operation
+from pytracking.libs.paddle_utils import _padding
+
+
+@tensor_operation
+def rfftshift2(a: np.array):
+ h = a.shape[2] + 2
+ return np.concatenate([a[:, :, (h - 1) // 2:], a[:, :, :h // 2]], 2)
+
+
+@tensor_operation
+def irfftshift2(a: np.array):
+ mid = int((a.shape[2] - 1) / 2)
+ return np.concatenate([a[:, :, mid:], a[:, :, :mid]], 2)
+
+
+@tensor_operation
+def cfft2(a):
+ """Do FFT and center the low frequency component.
+ Always produces odd (full) output sizes."""
+ out = rfftshift2(np.fft.rfft2(a))
+ return np.stack([out.real, out.imag], axis=-1)
+
+
+@tensor_operation
+def cifft2(a, signal_sizes=None):
+ """Do inverse FFT corresponding to cfft2."""
+ out = irfftshift2(a)
+ return np.fft.irfft2(out[..., 0] + 1j * out[..., 1], s=signal_sizes)
+
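+# Note: cfft2 returns the real FFT with the lowest frequency centered along
+# dim 2 and the real/imaginary parts stacked in a trailing dimension, so a
+# 4-D real array x satisfies cifft2(cfft2(x), signal_sizes=x.shape[-2:]) ~= x
+# up to floating point error.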
+
+@tensor_operation
+def sample_fs(a: np.array, grid_sz: np.array=None, rescale=True):
+ """Samples the Fourier series."""
+
+ # Size of the fourier series
+ sz = np.array([a.shape[2], 2 * a.shape[3] - 1], 'float32')
+
+ # Default grid
+ if grid_sz is None or sz[0] == grid_sz[0] and sz[1] == grid_sz[1]:
+ if rescale:
+ return np.prod(sz) * cifft2(a)
+ return cifft2(a)
+
+ if sz[0] > grid_sz[0] or sz[1] > grid_sz[1]:
+ raise ValueError(
+ "Only grid sizes that are smaller than the Fourier series size are supported."
+ )
+
+ tot_pad = (grid_sz - sz).tolist()
+ is_even = [s % 2 == 0 for s in sz]
+
+ # Compute paddings
+ pad_top = int((tot_pad[0] + 1) / 2) if is_even[0] else int(tot_pad[0] / 2)
+ pad_bottom = int(tot_pad[0] - pad_top)
+ pad_right = int((tot_pad[1] + 1) / 2)
+
+ if rescale:
+ return np.prod(grid_sz) * cifft2(
+ _padding(a, (0, 0, 0, pad_right, pad_top, pad_bottom)),
+ signal_sizes=grid_sz.astype('long').tolist())
+ else:
+ return cifft2(
+ _padding(a, (0, 0, 0, pad_right, pad_top, pad_bottom)),
+ signal_sizes=grid_sz.astype('long').tolist())
+
+
+def get_frequency_coord(sz, add_complex_dim=False, device='cpu'):
+ """Frequency coordinates."""
+
+ ky = np.reshape(
+ np.arange(
+ -int((sz[0] - 1) / 2), int(sz[0] / 2 + 1), dtype='float32'),
+ (1, 1, -1, 1))
+ kx = np.reshape(
+ np.arange(
+ 0, int(sz[1] / 2 + 1), dtype='float32'), (1, 1, 1, -1))
+
+ if add_complex_dim:
+ ky = np.expand_dims(ky, -1)
+ kx = np.expand_dims(kx, -1)
+
+ return ky, kx
+
+
+@tensor_operation
+def shift_fs(a: np.array, shift: np.array):
+ """Shift a sample a in the Fourier domain.
+ Params:
+ a : The Fourier coefficients of the sample.
+ shift : The shift to be performed normalized to the range [-pi, pi]."""
+
+ if a.ndim != 5:
+ raise ValueError(
+ 'a must be the Fourier coefficients, a 5-dimensional tensor.')
+
+ if shift[0] == 0 and shift[1] == 0:
+ return a
+
+ ky, kx = get_frequency_coord((a.shape[2], 2 * a.shape[3] - 1))
+
+ return complex.mult(
+ complex.mult(a, complex.exp_imag(shift[0] * ky)),
+ complex.exp_imag(shift[1] * kx))
+
+
+def sum_fs(a: TensorList) -> np.array:
+ """Sum a list of Fourier series expansions."""
+
+ s = None
+ mid = None
+
+ for e in sorted(a, key=lambda elem: elem.shape[-3], reverse=True):
+ if s is None:
+ s = e.copy()
+ mid = int((s.shape[-3] - 1) / 2)
+ else:
+ # Compute coordinates
+ top = mid - int((e.shape[-3] - 1) / 2)
+ bottom = mid + int(e.shape[-3] / 2) + 1
+ right = e.shape[-2]
+
+ # Add the data
+ s[..., top:bottom, :right, :] += e
+
+ return s
+
+
+def sum_fs12(a: TensorList) -> np.array:
+ """Sum a list of Fourier series expansions."""
+
+ s = None
+ mid = None
+
+ for e in sorted(a, key=lambda elem: elem.shape[0], reverse=True):
+ if s is None:
+ s = e.copy()
+ mid = int((s.shape[0] - 1) / 2)
+ else:
+ # Compute coordinates
+ top = mid - int((e.shape[0] - 1) / 2)
+ bottom = mid + int(e.shape[0] / 2) + 1
+ right = e.shape[1]
+
+ # Add the data
+ s[top:bottom, :right, ...] += e
+
+ return s
+
+
+@tensor_operation
+def inner_prod_fs(a: np.array, b: np.array):
+ if complex.is_complex(a) and complex.is_complex(b):
+ return 2 * (a.flatten() @b.flatten()
+ ) - a[:, :, :, 0, :].flatten() @b[:, :, :, 0, :].flatten()
+ elif complex.is_real(a) and complex.is_real(b):
+ return 2 * (a.flatten() @b.flatten()
+ ) - a[:, :, :, 0].flatten() @b[:, :, :, 0].flatten()
+ else:
+ raise NotImplementedError('Not implemented for mixed real and complex.')
diff --git a/PaddleCV/tracking/pytracking/libs/operation.py b/PaddleCV/tracking/pytracking/libs/operation.py
new file mode 100644
index 0000000000000000000000000000000000000000..62e5250a20a4124d05aed571cc8fda6f2100e3a2
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/libs/operation.py
@@ -0,0 +1,59 @@
+from paddle import fluid
+from paddle.fluid import layers
+from pytracking.libs.Fconv2d import Fconv2d
+from pytracking.libs.tensorlist import tensor_operation, TensorList
+from paddle.fluid.framework import Variable as PTensor
+
+
+@tensor_operation
+def conv2d(input: PTensor,
+ weight: PTensor,
+ bias: PTensor=None,
+ stride=1,
+ padding=0,
+ dilation=1,
+ groups=1,
+ mode=None):
+ """Standard conv2d. Returns the input if weight=None."""
+
+ if weight is None:
+ return input
+
+ ind = None
+ if mode is not None:
+ if padding != 0:
+ raise ValueError('Cannot input both padding and mode.')
+ if mode == 'same':
+ padding = (weight.shape[2] // 2, weight.shape[3] // 2)
+ if weight.shape[2] % 2 == 0 or weight.shape[3] % 2 == 0:
+ ind = (slice(-1)
+ if weight.shape[2] % 2 == 0 else slice(None), slice(-1)
+ if weight.shape[3] % 2 == 0 else slice(None))
+ elif mode == 'valid':
+ padding = (0, 0)
+ elif mode == 'full':
+ padding = (weight.shape[2] - 1, weight.shape[3] - 1)
+ else:
+ raise ValueError('Unknown mode for padding.')
+
+ assert bias is None
+ out = Fconv2d(
+ input,
+ weight,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ groups=groups)
+ if ind is None:
+ return out
+ return out[:, :, ind[0], ind[1]]
+
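+# Padding modes above, for an H x W input and a kh x kw kernel with stride 1:
+#   'valid' -> (H - kh + 1) x (W - kw + 1)
+#   'same'  -> H x W (the trailing row/column is trimmed when kh or kw is even)
+#   'full'  -> (H + kh - 1) x (W + kw - 1)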
+
+@tensor_operation
+def conv1x1(input: PTensor, weight: PTensor):
+ """Do a convolution with a 1x1 kernel weights. Implemented with matmul, which can be faster than using conv."""
+
+ if weight is None:
+ return input
+
+ return Fconv2d(input, weight)
diff --git a/PaddleCV/tracking/pytracking/libs/optimization.py b/PaddleCV/tracking/pytracking/libs/optimization.py
new file mode 100644
index 0000000000000000000000000000000000000000..41236ad8c9f26451903020da77a5b3910345cd62
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/libs/optimization.py
@@ -0,0 +1,779 @@
+import numpy as np
+from paddle.fluid import layers
+from paddle import fluid
+from pytracking.libs.tensorlist import TensorList
+from pytracking.utils.plotting import plot_graph
+from pytracking.libs.paddle_utils import n2p, clone, static_clone
+
+
+class L2Problem:
+ """Base class for representing an L2 optimization problem."""
+
+ def __call__(self, x: TensorList) -> TensorList:
+ """Shall compute the residuals of the problem."""
+ raise NotImplementedError
+
+ def ip_input(self, a, b):
+ """Inner product of the input space."""
+ return sum(a.view(-1) @b.view(-1))
+
+ def ip_output(self, a, b):
+ """Inner product of the output space."""
+ return sum(a.view(-1) @b.view(-1))
+
+ def M1(self, x):
+ """M1 preconditioner."""
+ return x
+
+ def M2(self, x):
+ """M2 preconditioner."""
+ return x
+
+ def get_feed_dict(self):
+ raise NotImplementedError
+
+
+class MinimizationProblem:
+ """General minimization problem."""
+
+ def __call__(self, x: TensorList) -> TensorList:
+ """Shall compute the loss."""
+ raise NotImplementedError
+
+ def ip_input(self, a, b):
+ """Inner product of the input space."""
+ return sum(a.view(-1) @b.view(-1))
+
+ def M1(self, x):
+ return x
+
+ def M2(self, x):
+ return x
+
+ def get_feed_dict(self):
+ raise NotImplementedError
+
+
+class ConjugateGradientBase:
+ """Conjugate Gradient optimizer base class. Implements the CG loop."""
+
+ def __init__(self,
+ fletcher_reeves=True,
+ standard_alpha=True,
+ direction_forget_factor=0,
+ debug=False):
+ self.fletcher_reeves = fletcher_reeves
+ self.standard_alpha = standard_alpha
+ self.direction_forget_factor = direction_forget_factor
+ self.debug = debug
+
+ # State
+ self.p = None
+ self.rho = np.ones((1, ), 'float32')
+ self.r_prev = None
+
+ # Right hand side
+ self.b = None
+
+ def reset_state(self):
+ self.p = None
+ self.rho = np.ones((1, ), 'float32')
+ self.r_prev = None
+
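+ # run_CG implements standard preconditioned CG: per iteration it computes
+ # z = M2(M1(r)), rho = <r, z>, beta = rho / rho_prev (Fletcher-Reeves) or
+ # (rho - <r_prev, z>) / rho_prev (Polak-Ribiere), p = z + beta * p,
+ # alpha = rho / <p, A p>, then updates x += alpha * p and r -= alpha * A p.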
+ def run_CG(self, num_iter, x=None, eps=0.0):
+ """Main conjugate gradient method.
+
+ args:
+ num_iter: Number of iterations.
+ x: Initial guess. Assumed zero if None.
+ eps: Stop if the residual norm gets smaller than this.
+ """
+
+ # Apply forgetting factor
+ if self.direction_forget_factor == 0:
+ self.reset_state()
+ elif self.p is not None:
+ self.rho /= self.direction_forget_factor
+
+ if x is None:
+ r = self.b.clone()
+ else:
+ r = self.b - self.A(x)
+
+ # Norms of residuals etc for debugging
+ resvec = None
+
+ # Loop over iterations
+ for ii in range(num_iter):
+ # Preconditioners
+ y = self.M1(r)
+ z = self.M2(y)
+
+ rho1 = self.rho
+ self.rho = self.ip(r, z)
+
+ if self.check_zero(self.rho):
+ if self.debug:
+ print('Stopped CG since rho = 0')
+ if resvec is not None:
+ resvec = resvec[:ii + 1]
+ return x, resvec
+
+ if self.p is None:
+ self.p = z.clone()
+ else:
+ if self.fletcher_reeves:
+ beta = self.rho / rho1
+ else:
+ rho2 = self.ip(self.r_prev, z)
+ beta = (self.rho - rho2) / rho1
+
+ beta = beta.apply(lambda a: np.clip(a, 0, 1e10))
+ self.p = z + self.p * beta
+
+ q = self.A(self.p)
+ pq = self.ip(self.p, q)
+
+ if self.standard_alpha:
+ alpha = self.rho / pq
+ else:
+ alpha = self.ip(self.p, r) / pq
+
+ # Save old r for PR formula
+ if not self.fletcher_reeves:
+ self.r_prev = r.clone()
+
+ # Form new iterate
+ if x is None:
+ x = self.p * alpha
+ else:
+ x += self.p * alpha
+
+ if ii < num_iter - 1 or self.debug:
+ r -= q * alpha
+
+ if eps > 0.0 or self.debug:
+ normr = self.residual_norm(r)
+
+ # if self.debug:
+ if True:
+ self.evaluate_CG_iteration(x)
+ # resvec[ii + 1] = normr
+
+ if eps > 0 and normr <= eps:
+ if self.debug:
+ print('Stopped CG since norm smaller than eps')
+ break
+
+ if resvec is not None:
+ resvec = resvec[:ii + 2]
+
+ return x, resvec
+
+ def A(self, x):
+ # Implements the left hand operation
+ raise NotImplementedError
+
+ def ip(self, a, b):
+ # Implements the inner product
+ return a.view(-1) @b.view(-1)
+
+ def residual_norm(self, r):
+ res = self.ip(r, r).sum()
+ if isinstance(res, (TensorList, list, tuple)):
+ res = sum(res)
+ return np.sqrt(res)
+
+ def check_zero(self, s, eps=0.0):
+ ss = s.abs() <= eps
+ if isinstance(ss, (TensorList, list, tuple)):
+ ss = sum(ss)
+ return ss > 0
+
+ def M1(self, x):
+ # M1 preconditioner
+ return x
+
+ def M2(self, x):
+ # M2 preconditioner
+ return x
+
+ def evaluate_CG_iteration(self, x):
+ pass
+
+
+class ConjugateGradient(ConjugateGradientBase):
+ """Conjugate Gradient optimizer, performing single linearization of the residuals in the start."""
+
+ def __init__(self,
+ problem: L2Problem,
+ variable: TensorList,
+ cg_eps=0.0,
+ fletcher_reeves=True,
+ standard_alpha=True,
+ direction_forget_factor=0,
+ debug=False,
+ analyze=False,
+ plotting=False,
+ fig_num=(10, 11)):
+ super().__init__(fletcher_reeves, standard_alpha,
+ direction_forget_factor, debug or plotting)
+
+ self.problem = problem
+ self.x = variable
+
+ self.plotting = plotting
+ self.fig_num = fig_num
+
+ self.cg_eps = cg_eps
+ self.f0 = None
+ self.g = None
+ self.dfdxt_g = None
+
+ self.residuals = np.zeros(0)
+ self.losses = np.zeros(0)
+ self._construct_graph()
+ self.analyze_convergence = analyze
+
+ def clear_temp(self):
+ pass
+
+ def _construct_graph(self):
+ train_program = fluid.Program()
+ start_program = fluid.Program()
+ with fluid.program_guard(train_program, start_program):
+ scope = 'first/'
+ self.x_ph = TensorList([
+ fluid.layers.data(
+ '{}x_{}'.format(scope, idx),
+ v.shape,
+ append_batch_size=False,
+ stop_gradient=False) for idx, v in enumerate(self.x)
+ ])
+ self.p_ph = TensorList([
+ fluid.layers.data(
+ '{}p_{}'.format(scope, idx),
+ v.shape,
+ append_batch_size=False,
+ stop_gradient=False) for idx, v in enumerate(self.x)
+ ])
+
+ # problem forward
+ self.f0 = self.problem(self.x_ph, scope)
+
+ self.g = self.f0.apply(static_clone)
+ # self.g = self.f0
+
+ # Get df/dx^t @ f0
+ self.dfdxt_g = TensorList(
+ fluid.gradients(self.f0, self.x_ph, self.g))
+
+ # For computing A
+ tmp = [a * b for a, b in zip(self.dfdxt_g, self.p_ph)]
+ self.dfdx_x = TensorList(fluid.gradients(tmp, self.g))
+ # self.dfdx_x = TensorList(fluid.gradients(self.dfdxt_g, self.g, self.p_ph))
+
+ train_program2 = fluid.Program()
+ start_program2 = fluid.Program()
+ with fluid.program_guard(train_program2, start_program2):
+ scope = 'second/'
+ self.x_ph_2 = TensorList([
+ fluid.layers.data(
+ '{}x_{}'.format(scope, idx),
+ v.shape,
+ append_batch_size=False,
+ stop_gradient=False) for idx, v in enumerate(self.x)
+ ])
+ self.dfdx_x_ph = TensorList([
+ fluid.layers.data(
+ '{}dfdx_x_{}'.format(scope, idx),
+ v.shape,
+ append_batch_size=False,
+ stop_gradient=False) for idx, v in enumerate(self.g)
+ ])
+
+ self.f0_2 = self.problem(self.x_ph_2, scope)
+ self.dfdx_dfdx = TensorList(
+ fluid.gradients(self.f0_2 * self.dfdx_x_ph, self.x_ph_2))
+
+ place = fluid.CUDAPlace(0)
+ self.exe = fluid.Executor(place)
+ self.exe.run(program=fluid.default_startup_program())
+ self.compiled_prog = fluid.compiler.CompiledProgram(train_program)
+
+ place2 = fluid.CUDAPlace(0)
+ self.exe2 = fluid.Executor(place2)
+ self.exe2.run(program=fluid.default_startup_program())
+ self.compiled_prog2 = fluid.compiler.CompiledProgram(train_program2)
+
+ def get_dfdxt_g(self):
+ scope = 'first/'
+ feed_dict = self.problem.get_feed_dict(scope)
+ # add variable feed
+ for idx, v in enumerate(self.x):
+ feed_dict['{}x_{}'.format(scope, idx)] = v
+ for idx, v in enumerate(self.x):
+ feed_dict['{}p_{}'.format(scope, idx)] = v
+ res = self.exe.run(self.compiled_prog,
+ feed=feed_dict,
+ fetch_list=[v.name for v in self.dfdxt_g])
+ return TensorList(res)
+
+ def run(self, num_cg_iter):
+ """Run the oprimizer with the provided number of iterations."""
+
+ if num_cg_iter == 0:
+ return
+
+ # Get the right hand side
+ self.b = -self.get_dfdxt_g()
+
+ self.evaluate_CG_iteration(0)
+
+ # Run CG
+ delta_x, res = self.run_CG(num_cg_iter, eps=self.cg_eps)
+
+ self.x += delta_x
+
+ # reset problem training samples
+ self.problem.training_samples_stack = None
+
+ def A(self, x):
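+ # Gauss-Newton matrix-vector product A @ p = J^T J @ p, with J = df/dx,
+ # computed without forming J explicitly:
+ # pass 1 (first program): feed p and differentiate sum(<J^T g, p>) w.r.t.
+ # the cloned residuals g, which yields dfdx_x = J @ p;
+ # pass 2 (second program): differentiate <f, J @ p> w.r.t. the variables,
+ # which yields J^T (J @ p).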
+ # First pass
+ scope = 'first/'
+ feed_dict = self.problem.get_feed_dict(scope)
+ # add variable feed
+ for idx, v in enumerate(self.x):
+ feed_dict['{}x_{}'.format(scope, idx)] = v
+ # add p feed
+ for idx, v in enumerate(x):
+ feed_dict['{}p_{}'.format(scope, idx)] = v
+
+ dfdx_x = TensorList(
+ self.exe.run(self.compiled_prog,
+ feed=feed_dict,
+ fetch_list=[v.name for v in self.dfdx_x]))
+
+ # Second pass
+ scope = 'second/'
+ feed_dict = self.problem.get_feed_dict(scope)
+ # add variable feed
+ for idx, v in enumerate(self.x):
+ feed_dict['{}x_{}'.format(scope, idx)] = v
+ # add p feed
+ for idx, v in enumerate(dfdx_x):
+ feed_dict['{}dfdx_x_{}'.format(scope, idx)] = v
+
+ res = TensorList(
+ self.exe2.run(self.compiled_prog2,
+ feed=feed_dict,
+ fetch_list=[v.name for v in self.dfdx_dfdx]))
+
+ return res
+
+ def ip(self, a, b):
+ return self.problem.ip_input(a, b)
+
+ def M1(self, x):
+ return self.problem.M1(x)
+
+ def M2(self, x):
+ return self.problem.M2(x)
+
+ def evaluate_CG_iteration(self, delta_x):
+ if self.analyze_convergence:
+ scope = 'first/'
+ x = self.x + delta_x
+ feed_dict = self.problem.get_feed_dict(scope)
+ for idx, v in enumerate(x):
+ feed_dict['{}x_{}'.format(scope, idx)] = v
+ for idx, v in enumerate(x):
+ feed_dict['{}p_{}'.format(scope, idx)] = v
+ res = self.exe.run(self.compiled_prog,
+ feed=feed_dict,
+ fetch_list=[v.name for v in self.f0])
+ res = TensorList(res)
+ loss = self.problem.ip_output(res, res)
+ #print('Paddle Loss: {}'.format(loss))
+
+
+class GaussNewtonCG(ConjugateGradientBase):
+ """Gauss-Newton with Conjugate Gradient optimizer."""
+
+ def __init__(self,
+ problem: L2Problem,
+ variable: TensorList,
+ cg_eps=0.0,
+ fletcher_reeves=True,
+ standard_alpha=True,
+ direction_forget_factor=0,
+ debug=False,
+ analyze=False,
+ plotting=False,
+ fig_num=(10, 11, 12)):
+ super().__init__(fletcher_reeves, standard_alpha,
+ direction_forget_factor, debug or analyze or plotting)
+
+ self.problem = problem
+ self.x = variable
+
+ self.analyze_convergence = analyze
+ self.plotting = plotting
+ self.fig_num = fig_num
+
+ self.cg_eps = cg_eps
+ self.f0 = None
+ self.g = None
+ self.dfdxt_g = None
+
+ self.residuals = np.zeros(0)
+ self.losses = np.zeros(0)
+ self.gradient_mags = np.zeros(0)
+ self._construct_graph()
+
+ def clear_temp(self):
+ self.f0 = None
+ self.g = None
+ self.dfdxt_g = None
+
+ def run_GN(self, *args, **kwargs):
+ return self.run(*args, **kwargs)
+
+ def _construct_graph(self):
+ train_program = fluid.Program()
+ start_program = fluid.Program()
+ with fluid.program_guard(train_program, start_program):
+ scope = 'first/'
+ self.x_ph = TensorList([
+ fluid.layers.data(
+ '{}x_{}'.format(scope, idx),
+ v.shape,
+ append_batch_size=False,
+ stop_gradient=False) for idx, v in enumerate(self.x)
+ ])
+ self.p_ph = TensorList([
+ fluid.layers.data(
+ '{}p_{}'.format(scope, idx),
+ v.shape,
+ append_batch_size=False,
+ stop_gradient=False) for idx, v in enumerate(self.x)
+ ])
+
+ # problem forward
+ self.f0 = self.problem(self.x_ph, scope)
+
+ self.g = self.f0.apply(static_clone)
+
+ # Get df/dx^t @ f0
+ self.dfdxt_g = TensorList(
+ fluid.gradients(self.f0, self.x_ph, self.g))
+
+ # For computing A
+ tmp = [a * b for a, b in zip(self.dfdxt_g, self.p_ph)]
+ self.dfdx_x = TensorList(fluid.gradients(tmp, self.g))
+ # self.dfdx_x = TensorList(fluid.gradients(self.dfdxt_g, self.g, self.p_ph))
+
+ train_program2 = fluid.Program()
+ start_program2 = fluid.Program()
+ with fluid.program_guard(train_program2, start_program2):
+ scope = 'second/'
+ self.x_ph_2 = TensorList([
+ fluid.layers.data(
+ '{}x_{}'.format(scope, idx),
+ v.shape,
+ append_batch_size=False,
+ stop_gradient=False) for idx, v in enumerate(self.x)
+ ])
+ self.dfdx_x_ph = TensorList([
+ fluid.layers.data(
+ '{}dfdx_x_{}'.format(scope, idx),
+ v.shape,
+ append_batch_size=False,
+ stop_gradient=False) for idx, v in enumerate(self.g)
+ ])
+
+ self.f0_2 = self.problem(self.x_ph_2, scope)
+ self.dfdx_dfdx = TensorList(
+ fluid.gradients(self.f0_2 * self.dfdx_x_ph, self.x_ph_2))
+
+ place = fluid.CUDAPlace(0)
+ self.exe = fluid.Executor(place)
+ self.exe.run(program=fluid.default_startup_program())
+ self.compiled_prog = fluid.compiler.CompiledProgram(train_program)
+
+ place2 = fluid.CUDAPlace(0)
+ self.exe2 = fluid.Executor(place2)
+ self.exe2.run(program=fluid.default_startup_program())
+ self.compiled_prog2 = fluid.compiler.CompiledProgram(train_program2)
+
+ def get_dfdxt_g(self):
+ scope = 'first/'
+ feed_dict = self.problem.get_feed_dict(scope)
+ # add variable feed
+ for idx, v in enumerate(self.x):
+ feed_dict['{}x_{}'.format(scope, idx)] = v
+ for idx, v in enumerate(self.x):
+ feed_dict['{}p_{}'.format(scope, idx)] = v
+ res = self.exe.run(self.compiled_prog,
+ feed=feed_dict,
+ fetch_list=[v.name for v in self.dfdxt_g])
+ return TensorList(res)
+
+ def run(self, num_cg_iter, num_gn_iter=None):
+ """Run the optimizer.
+ args:
+ num_cg_iter: Number of CG iterations per GN iteration. If a list, each entry gives the number of CG
+ iterations for one GN iteration, and the number of GN iterations equals the length of the list.
+ num_gn_iter: Number of GN iterations. Must be given if num_cg_iter is an integer; ignored otherwise.
+ """
+
+ if isinstance(num_cg_iter, int):
+ if num_gn_iter is None:
+ raise ValueError(
+ 'Must specify number of GN iter if CG iter is constant')
+ num_cg_iter = [num_cg_iter] * num_gn_iter
+
+ num_gn_iter = len(num_cg_iter)
+ if num_gn_iter == 0:
+ return
+
+ if self.analyze_convergence:
+ self.evaluate_CG_iteration(0)
+
+ # Outer loop for running the GN iterations.
+ for cg_iter in num_cg_iter:
+ self.run_GN_iter(cg_iter)
+
+ # reset problem training samples
+ self.problem.training_samples_stack = None
+ return self.losses, self.residuals
+
+ def run_GN_iter(self, num_cg_iter):
+ """Runs a single GN iteration."""
+
+ self.b = -self.get_dfdxt_g()
+
+ # Run CG
+ if num_cg_iter > 0:
+ delta_x, res = self.run_CG(num_cg_iter, eps=self.cg_eps)
+ self.x += delta_x
+
+ def A(self, x):
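+ # Same two-pass J^T J @ p product as in ConjugateGradient.A above.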
+ # First pass
+ scope = 'first/'
+ feed_dict = self.problem.get_feed_dict(scope)
+ # add variable feed
+ for idx, v in enumerate(self.x):
+ feed_dict['{}x_{}'.format(scope, idx)] = v
+ # add p feed
+ for idx, v in enumerate(x):
+ feed_dict['{}p_{}'.format(scope, idx)] = v
+
+ dfdx_x = TensorList(
+ self.exe.run(self.compiled_prog,
+ feed=feed_dict,
+ fetch_list=[v.name for v in self.dfdx_x]))
+
+ # Second pass
+ scope = 'second/'
+ feed_dict = self.problem.get_feed_dict(scope)
+ # add variable feed
+ for idx, v in enumerate(self.x):
+ feed_dict['{}x_{}'.format(scope, idx)] = v
+ # add p feed
+ for idx, v in enumerate(dfdx_x):
+ feed_dict['{}dfdx_x_{}'.format(scope, idx)] = v
+
+ res = TensorList(
+ self.exe2.run(self.compiled_prog2,
+ feed=feed_dict,
+ fetch_list=[v.name for v in self.dfdx_dfdx]))
+
+ return res
+
+ def ip(self, a, b):
+ return self.problem.ip_input(a, b)
+
+ def M1(self, x):
+ return self.problem.M1(x)
+
+ def M2(self, x):
+ return self.problem.M2(x)
+
+ def evaluate_CG_iteration(self, delta_x):
+ if self.analyze_convergence:
+ scope = 'first/'
+ x = self.x + delta_x
+ feed_dict = self.problem.get_feed_dict(scope)
+ for idx, v in enumerate(x):
+ feed_dict['{}x_{}'.format(scope, idx)] = v
+ for idx, v in enumerate(x):
+ feed_dict['{}p_{}'.format(scope, idx)] = v
+ res = self.exe.run(self.compiled_prog,
+ feed=feed_dict,
+ fetch_list=[v.name for v in self.f0])
+ res = TensorList(res)
+ loss = self.problem.ip_output(res, res)
+ #print('Paddle Loss: {}'.format(loss))
+
+
+class GradientDescentL2:
+ """Gradient descent with momentum for L2 problems."""
+
+ def __init__(self,
+ problem: L2Problem,
+ variable: TensorList,
+ step_length: float,
+ momentum: float=0.0,
+ debug=False,
+ plotting=False,
+ fig_num=(10, 11)):
+
+ self.problem = problem
+ self.x = variable # Numpy arrays
+
+ self.step_length = step_length
+ self.momentum = momentum
+
+ self.debug = debug or plotting
+ self.plotting = plotting
+ self.fig_num = fig_num
+
+ self.losses = np.zeros(0)
+ self.gradient_mags = np.zeros(0)
+ self.residuals = None
+
+ self.clear_temp()
+ self._construct_graph()
+
+ def clear_temp(self):
+ self.f0 = None
+ self.dir = None
+
+ def _construct_graph(self):
+ train_program = fluid.Program()
+ start_program = fluid.Program()
+ with fluid.program_guard(train_program, start_program):
+ self.x_ph = TensorList([
+ fluid.layers.data(
+ 'x_{}'.format(idx),
+ v.shape,
+ append_batch_size=False,
+ stop_gradient=False) for idx, v in enumerate(self.x)
+ ])
+
+ # problem forward
+ self.f0 = self.problem(self.x_ph)
+ self.loss = self.problem.ip_output(self.f0, self.f0)
+ # problem backward
+ self.grad = TensorList(fluid.gradients(self.loss, self.x_ph))
+
+ place = fluid.CUDAPlace(0)
+ self.exe = fluid.Executor(place)
+ self.exe.run(program=fluid.default_startup_program())
+ self.compiled_prog = fluid.compiler.CompiledProgram(train_program)
+
+ def get_feed_dict(self, x_list):
+ feed_dict = self.problem.get_feed_dict()
+ # add variable feed
+ for idx, v in enumerate(x_list):
+ feed_dict['x_{}'.format(idx)] = v
+ return feed_dict
+
+ def run(self, num_iter, dummy=None):
+ if num_iter == 0:
+ return
+
+ grad_names = [v.name for v in self.grad]
+ for i in range(num_iter):
+ res = self.exe.run(self.compiled_prog,
+ feed=self.get_feed_dict(self.x),
+ fetch_list=[self.loss.name] + grad_names)
+ if self.debug:
+ loss = res[0]
+ #print('Paddle Loss: {}'.format(loss))
+
+ grad = TensorList(res[1:])
+
+ # update parameters
+ if self.dir is None:
+ self.dir = grad
+ else:
+ self.dir = grad + self.momentum * self.dir
+ self.x = self.x - self.step_length * self.dir
+
+ # reset problem training samples
+ self.problem.training_samples_stack = None
+
+
+class GradientDescent:
+ """Gradient descent for general minimization problems."""
+
+ def __init__(self,
+ problem: MinimizationProblem,
+ variable: TensorList,
+ step_length: float,
+ momentum: float=0.0,
+ debug=False,
+ plotting=False,
+ fig_num=(10, 11)):
+
+ self.problem = problem
+ self.x = variable
+
+ self.step_length = step_length
+ self.momentum = momentum
+
+ self.debug = debug or plotting
+ self.plotting = plotting
+ self.fig_num = fig_num
+
+ self.losses = layers.zeros((0, ), 'float32')
+ self.gradient_mags = layers.zeros((0, ), 'float32')
+ self.residuals = None
+
+ self.clear_temp()
+
+ def clear_temp(self):
+ self.dir = None
+
+ def run(self, num_iter, dummy=None):
+
+ if num_iter == 0:
+ return
+
+ lossvec = None
+ if self.debug:
+ lossvec = np.zeros((num_iter + 1, ))
+ grad_mags = np.zeros((num_iter + 1, ))
+
+ for i in range(num_iter):
+ self.x.stop_gradient = False
+
+ # Evaluate function at current estimate
+ loss = self.problem(self.x)
+
+ # Compute grad
+ loss.backward()
+ grad = TensorList(self.x.gradient()).apply(n2p)
+ self.x.clear_gradient()
+
+ # Update direction
+ if self.dir is None:
+ self.dir = grad
+ else:
+ self.dir = grad + self.momentum * self.dir
+
+ self.x = self.x.detach()
+ self.x -= self.step_length * self.dir
+
+ if self.debug:
+ lossvec[i] = loss.numpy()
+ grad_mags[i] = self.problem.ip_input(
+ grad, grad).apply(layers.sqrt).numpy()
+
+ self.problem.training_samples_stack = None
+
+ self.x = self.x.detach()
+ self.x.stop_gradient = True
+ self.clear_temp()
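+
+
+# Minimal usage sketch (mirrors how the ATOM tracker wires these optimizers in
+# pytracking/tracker/atom/atom.py; illustrative only):
+#
+# problem = ConvProblem(training_samples, y, filter_reg, sample_weights,
+# response_activation)
+# optimizer = ConjugateGradient(problem, filter_vars, fletcher_reeves=False,
+# direction_forget_factor=0)
+# optimizer.run(num_cg_iter=5) # updates optimizer.x (the filter) in place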
diff --git a/PaddleCV/tracking/pytracking/libs/paddle_utils.py b/PaddleCV/tracking/pytracking/libs/paddle_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5fb99d2df6d571cc23525df5dd25a5c23b1bfe0
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/libs/paddle_utils.py
@@ -0,0 +1,218 @@
+import numpy as np
+import paddle
+from paddle.fluid import dygraph
+from paddle.fluid import layers
+from paddle.fluid.framework import Variable
+import cv2 as cv
+PTensor = Variable
+
+
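+# broadcast_op applies an elementwise op after explicitly expanding both
+# operands to a common shape (ranks must match; a mismatched dim must be 1 on
+# one side). Example: a.shape == [1, 3], b.shape == [2, 1] -> both operands
+# expanded to [2, 3] before the op.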
+def broadcast_op(a, b, op='mul'):
+ a_expand_factors = []
+ b_expand_factors = []
+ assert len(a.shape) == len(
+ b.shape), 'a.shape = {} while b.shape = {}'.format(a.shape, b.shape)
+ for a_s, b_s in zip(a.shape, b.shape):
+ if a_s != b_s:
+ if a_s == 1:
+ a_expand_factors.append(b_s)
+ b_expand_factors.append(1)
+ elif b_s == 1:
+ a_expand_factors.append(1)
+ b_expand_factors.append(a_s)
+ else:
+ raise NotImplementedError
+ else:
+ a_expand_factors.append(1)
+ b_expand_factors.append(1)
+ if op == 'mul':
+ op = layers.elementwise_mul
+ elif op == 'add':
+ op = layers.elementwise_add
+ elif op == 'sub':
+ op = layers.elementwise_sub
+ elif op == 'div':
+ op = layers.elementwise_div
+ else:
+ raise NotImplementedError
+ return op(
+ layers.expand(a, a_expand_factors), layers.expand(b, b_expand_factors))
+
+
+def paddle_prod(x):
+ prod = 1
+ num_elems = x.shape[0]
+ for idx in range(num_elems):
+ prod *= x[idx]
+ return prod
+
+
+def n2p(x, dtype=None):
+ if dtype is None:
+ x = np.array(x)
+ if x.dtype == np.float64:
+ x = x.astype('float32')
+ else:
+ x = np.array(x, dtype=dtype)
+ return dygraph.to_variable(x)
+
+
+def p2n(x):
+ return x.numpy()
+
+
+def clone(x):
+ v = dygraph.to_variable(x.numpy())
+ v.stop_gradient = x.stop_gradient
+ return v
+
+
+def static_identity(x):
+ x = layers.reshape(x, x.shape)
+ return x
+
+
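+# static_clone returns a variable with the same value as x whose dependency on
+# x is cut (stop_gradient on the intermediate copy), so it can be fed and
+# differentiated as if it were an independent input. The optimizers in
+# libs/optimization.py rely on this to treat the cloned residuals g as a
+# constant copy of f0 when building their Gauss-Newton products.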
+def static_clone(x):
+ x1 = static_identity(x)
+ x1.stop_gradient = True
+ x2 = static_identity(x1)
+ x2.stop_gradient = x.stop_gradient
+ return x2
+
+
+def detach(x):
+ v = dygraph.to_variable(x.numpy())
+ v.stop_gradient = True
+ return v
+
+
+def squeeze(input, axes):
+ new_shape = []
+ for i, s in enumerate(input.shape):
+ if i in axes:
+ assert s == 1
+ else:
+ new_shape.append(s)
+ return layers.reshape(input, new_shape)
+
+
+def unsqueeze(input, axes):
+ new_shape = []
+ for i, s in enumerate(input.shape):
+ for a in axes:
+ if i == a:
+ new_shape.append(1)
+ new_shape.append(s)
+ return layers.reshape(input, new_shape)
+
+
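+# crop(x, [(t, b), (l, r), ...]) slices x[t:-b, l:-r, ...] per dimension; a
+# trailing amount of 0 keeps the full extent on that side.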
+def crop(x, crops):
+ slices = []
+ for c in crops:
+ c1 = None if c[1] == 0 else -c[1]
+ slices.append(slice(c[0], c1))
+ return x[tuple(slices)]
+
+
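+# _padding takes (before, after) pad pairs starting from the last dimension
+# (the same ordering as torch.nn.functional.pad); negative values crop instead
+# of pad. For HxWxC images with no channel padding, cv.copyMakeBorder is used,
+# so 'replicate' mode maps to border replication.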
+def _padding(x, pads, mode='constant'):
+ return_tensor = False
+ if isinstance(x, PTensor):
+ x = x.numpy()
+ return_tensor = True
+
+ assert len(pads) % 2 == 0
+ pads = list(pads) + [0] * (len(x.shape) * 2 - len(pads))
+
+ # convert to numpy pad format
+ pads_np, pad_per_dim = [], []
+ for i, p in enumerate(pads):
+ if i % 2 == 0:
+ pad_per_dim = [p]
+ else:
+ pad_per_dim.append(p)
+ pads_np.insert(0, pad_per_dim)
+
+ # handle negative pads (cropping)
+ pads_np_pos, pads_np_neg = [], []
+ for pad_per_dim in pads_np:
+ pad_per_dim_pos, pad_per_dim_neg = [], []
+ for p in pad_per_dim:
+ if p < 0:
+ pad_per_dim_pos.append(0)
+ pad_per_dim_neg.append(-p)
+ else:
+ pad_per_dim_pos.append(p)
+ pad_per_dim_neg.append(0)
+ pads_np_pos.append(pad_per_dim_pos)
+ pads_np_neg.append(pad_per_dim_neg)
+
+ # cropping
+ x = crop(x, pads_np_neg)
+
+ # padding
+ # if x is an image
+ if len(x.shape) == 3 and pads_np_pos[-1][0] == 0 and pads_np_pos[-1][
+ 1] == 0:
+ if mode == 'replicate':
+ pad_mode = cv.BORDER_REPLICATE
+ else:
+ pad_mode = cv.BORDER_CONSTANT
+ y1_pad, y2_pad = pads_np_pos[0]
+ x1_pad, x2_pad = pads_np_pos[1]
+ x = cv.copyMakeBorder(x, y1_pad, y2_pad, x1_pad, x2_pad, pad_mode)
+ else:
+ np_mode = 'edge' if mode == 'replicate' else 'constant'
+ x = np.pad(x, pads_np_pos, mode=np_mode)
+
+ out = dygraph.to_variable(x) if return_tensor else x
+ return out
+
+
+def mod(a, b):
+ arg_list, new_arg_list = [a, b], []
+ return_PTensor = False
+ for x in arg_list:
+ if isinstance(x, PTensor):
+ x = p2n(x)
+ return_PTensor = True
+ new_arg_list.append(x)
+
+ out = new_arg_list[0] % new_arg_list[1]
+ return n2p(out) if return_PTensor else out
+
+
+def floordiv(a, b):
+ arg_list, new_arg_list = [a, b], []
+ return_PTensor = False
+ for x in arg_list:
+ if isinstance(x, PTensor):
+ x = p2n(x)
+ return_PTensor = True
+ new_arg_list.append(x)
+
+ out = new_arg_list[0] // new_arg_list[1]
+ return n2p(out) if return_PTensor else out
+
+
+def stack_sum(x):
+ return layers.reduce_sum(layers.stack(x))
+
+
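+# leaky_relu and elu below are composed from relu/exp only; for x > 0 they
+# return x, and for x <= 0 they return alpha * x and alpha * (exp(x) - 1)
+# respectively.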
+def leaky_relu(x, alpha):
+ return layers.relu(x) + alpha * (-1 * layers.relu(-1 * x))
+
+
+def elu(x, alpha):
+ return layers.relu(x) + alpha * (layers.exp(-1 * layers.relu(-1 * x)) - 1)
+
+
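+# Channel-wise dropout: whole feature channels are zeroed with probability
+# prob and the survivors are scaled by 1 / keep_prob; it is a no-op unless
+# is_train=True.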
+def dropout2d(input, prob, is_train=False):
+ if not is_train:
+ return input
+ channels = input.shape[1]
+ keep_prob = 1.0 - prob
+ random_tensor = keep_prob + layers.uniform_random_batch_size_like(
+ input, [-1, channels, 1, 1], min=0., max=1.)
+ binary_tensor = layers.floor(random_tensor)
+ output = input / keep_prob * binary_tensor
+ return output
diff --git a/PaddleCV/tracking/pytracking/libs/tensordict.py b/PaddleCV/tracking/pytracking/libs/tensordict.py
new file mode 100644
index 0000000000000000000000000000000000000000..10325dd64b2170b09baf7f13cd204c61706d63ae
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/libs/tensordict.py
@@ -0,0 +1,36 @@
+from collections import OrderedDict
+
+
+class TensorDict(OrderedDict):
+ """Container mainly used for dicts of Variable."""
+
+ def concat(self, other):
+ """Concatenates two dicts without copying internal data."""
+ return TensorDict(self, **other)
+
+ def copy(self):
+ return TensorDict(super(TensorDict, self).copy())
+
+ def __getattr__(self, name):
+ for n, e in self.items():
+ if not hasattr(e, name):
+ raise AttributeError('\'{}\' object has no attribute \'{}\''.
+ format(type(e), name))
+
+ def apply_attr(*args, **kwargs):
+ return TensorDict({
+ n: getattr(e, name)(*args, **kwargs) if hasattr(e, name) else e
+ for n, e in self.items()
+ })
+
+ return apply_attr
+
+ def attribute(self, attr: str, *args):
+ return TensorDict({n: getattr(e, attr, *args) for n, e in self.items()})
+
+ def apply(self, fn, *args, **kwargs):
+ return TensorDict({n: fn(e, *args, **kwargs) for n, e in self.items()})
+
+ @staticmethod
+ def _iterable(a):
+ return isinstance(a, (TensorDict, list))
diff --git a/PaddleCV/tracking/pytracking/libs/tensorlist.py b/PaddleCV/tracking/pytracking/libs/tensorlist.py
new file mode 100644
index 0000000000000000000000000000000000000000..24d6e3d2a9488e7c75286233e5c10929fa339f35
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/libs/tensorlist.py
@@ -0,0 +1,268 @@
+import functools
+import numpy as np
+from paddle.fluid import layers
+
+from pytracking.libs.paddle_utils import clone as clone_fn
+from pytracking.libs.paddle_utils import detach as detach_fn
+from pytracking.libs.paddle_utils import PTensor
+
+
+def matmul(a, b):
+ if isinstance(a, PTensor) or isinstance(b, PTensor):
+ return layers.matmul(a, b)
+ else:
+ return np.matmul(a, b)
+
+
+class TensorList(list):
+ """Container mainly used for lists of paddle tensors. Extends lists with paddle functionality."""
+
+ def __init__(self, list_of_tensors=list()):
+ super(TensorList, self).__init__(list_of_tensors)
+
+ def __getitem__(self, item):
+ if isinstance(item, int):
+ return super(TensorList, self).__getitem__(item)
+ elif isinstance(item, (tuple, list)):
+ return TensorList(
+ [super(TensorList, self).__getitem__(i) for i in item])
+ else:
+ return TensorList(super(TensorList, self).__getitem__(item))
+
+ def __add__(self, other):
+ if TensorList._iterable(other):
+ return TensorList([e1 + e2 for e1, e2 in zip(self, other)])
+ return TensorList([e + other for e in self])
+
+ def __radd__(self, other):
+ if TensorList._iterable(other):
+ return TensorList([e2 + e1 for e1, e2 in zip(self, other)])
+ return TensorList([other + e for e in self])
+
+ def __iadd__(self, other):
+ if TensorList._iterable(other):
+ for i, e2 in enumerate(other):
+ self[i] += e2
+ else:
+ for i in range(len(self)):
+ self[i] += other
+ return self
+
+ def __sub__(self, other):
+ if TensorList._iterable(other):
+ return TensorList([e1 - e2 for e1, e2 in zip(self, other)])
+ return TensorList([e - other for e in self])
+
+ def __rsub__(self, other):
+ if TensorList._iterable(other):
+ return TensorList([e2 - e1 for e1, e2 in zip(self, other)])
+ return TensorList([other - e for e in self])
+
+ def __isub__(self, other):
+ if TensorList._iterable(other):
+ for i, e2 in enumerate(other):
+ self[i] -= e2
+ else:
+ for i in range(len(self)):
+ self[i] -= other
+ return self
+
+ def __mul__(self, other):
+ if TensorList._iterable(other):
+ return TensorList([e1 * e2 for e1, e2 in zip(self, other)])
+ return TensorList([e * other for e in self])
+
+ def __rmul__(self, other):
+ if TensorList._iterable(other):
+ return TensorList([e2 * e1 for e1, e2 in zip(self, other)])
+ return TensorList([other * e for e in self])
+
+ def __imul__(self, other):
+ if TensorList._iterable(other):
+ for i, e2 in enumerate(other):
+ self[i] *= e2
+ else:
+ for i in range(len(self)):
+ self[i] *= other
+ return self
+
+ def __truediv__(self, other):
+ if TensorList._iterable(other):
+ return TensorList([e1 / e2 for e1, e2 in zip(self, other)])
+ return TensorList([e / other for e in self])
+
+ def __rtruediv__(self, other):
+ if TensorList._iterable(other):
+ return TensorList([e2 / e1 for e1, e2 in zip(self, other)])
+ return TensorList([other / e for e in self])
+
+ def __itruediv__(self, other):
+ if TensorList._iterable(other):
+ for i, e2 in enumerate(other):
+ self[i] /= e2
+ else:
+ for i in range(len(self)):
+ self[i] /= other
+ return self
+
+ def __matmul__(self, other):
+ if TensorList._iterable(other):
+ return TensorList([matmul(e1, e2) for e1, e2 in zip(self, other)])
+ return TensorList([matmul(e, other) for e in self])
+
+ def __rmatmul__(self, other):
+ if TensorList._iterable(other):
+ return TensorList([matmul(e2, e1) for e1, e2 in zip(self, other)])
+ return TensorList([matmul(other, e) for e in self])
+
+ def __imatmul__(self, other):
+ if TensorList._iterable(other):
+ for i, e2 in enumerate(other):
+ self[i] = matmul(self[i], e2)
+ else:
+ for i in range(len(self)):
+ self[i] = matmul(self[i], other)
+ return self
+
+ def __mod__(self, other):
+ if TensorList._iterable(other):
+ return TensorList([e1 % e2 for e1, e2 in zip(self, other)])
+ return TensorList([e % other for e in self])
+
+ def __rmod__(self, other):
+ if TensorList._iterable(other):
+ return TensorList([e2 % e1 for e1, e2 in zip(self, other)])
+ return TensorList([other % e for e in self])
+
+ def __pos__(self):
+ return TensorList([+e for e in self])
+
+ def __neg__(self):
+ return TensorList([-e for e in self])
+
+ def __le__(self, other):
+ if TensorList._iterable(other):
+ return TensorList([e1 <= e2 for e1, e2 in zip(self, other)])
+ return TensorList([e <= other for e in self])
+
+ def __ge__(self, other):
+ if TensorList._iterable(other):
+ return TensorList([e1 >= e2 for e1, e2 in zip(self, other)])
+ return TensorList([e >= other for e in self])
+
+ def view(self, *args):
+ def reshape(x):
+ if isinstance(x, PTensor):
+ return layers.reshape(x, args)
+ else:
+ return np.reshape(x, args)
+
+ return self.apply(reshape)
+
+ def clone(self):
+ def _clone(x):
+ if isinstance(x, PTensor):
+ return clone_fn(x)
+ else:
+ return x.copy()
+
+ return self.apply(_clone)
+
+ def detach(self):
+ return self.apply(detach_fn)
+
+ def sqrt(self):
+ def _sqrt(x):
+ if isinstance(x, PTensor):
+ return layers.sqrt(x)
+ else:
+ return np.sqrt(x)
+
+ return self.apply(_sqrt)
+
+ def abs(self):
+ def _abs(x):
+ if isinstance(x, PTensor):
+ return layers.abs(x)
+ else:
+ return np.abs(x)
+
+ return self.apply(_abs)
+
+ def size(self, axis=None):
+ def get_size(x):
+ if axis is None:
+ return x.shape
+ else:
+ return x.shape[axis]
+
+ return self.apply(get_size)
+
+ def concat(self, other):
+ return TensorList(super(TensorList, self).__add__(other))
+
+ def copy(self):
+ return TensorList(super(TensorList, self).copy())
+
+ def unroll(self):
+ if not any(isinstance(t, TensorList) for t in self):
+ return self
+
+ new_list = TensorList()
+ for t in self:
+ if isinstance(t, TensorList):
+ new_list.extend(t.unroll())
+ else:
+ new_list.append(t)
+ return new_list
+
+ def attribute(self, attr: str, *args):
+ return TensorList([getattr(e, attr, *args) for e in self])
+
+ def apply(self, fn):
+ return TensorList([fn(e) for e in self])
+
+ def __getattr__(self, name):
+ for e in self:
+ if not hasattr(e, name):
+ raise AttributeError('\'{}\' object has no attribute \'{}\''.
+ format(type(e), name))
+
+ def apply_attr(*args, **kwargs):
+ return TensorList([getattr(e, name)(*args, **kwargs) for e in self])
+
+ return apply_attr
+
+ @staticmethod
+ def _iterable(a):
+ return isinstance(a, (TensorList, list))
+
+
+def tensor_operation(op):
+ def islist(a):
+ return isinstance(a, TensorList)
+
+ @functools.wraps(op)
+ def oplist(*args, **kwargs):
+ if len(args) == 0:
+ raise ValueError(
+ 'Must be at least one argument without keyword (i.e. operand).')
+
+ if len(args) == 1:
+ if islist(args[0]):
+ return TensorList([op(a, **kwargs) for a in args[0]])
+ else:
+ # Multiple operands, assume max two
+ if islist(args[0]) and islist(args[1]):
+ return TensorList(
+ [op(a, b, *args[2:], **kwargs) for a, b in zip(*args[:2])])
+ if islist(args[0]):
+ return TensorList([op(a, *args[1:], **kwargs) for a in args[0]])
+ if islist(args[1]):
+ return TensorList(
+ [op(args[0], b, *args[2:], **kwargs) for b in args[1]])
+
+ # None of the operands are lists
+ return op(*args, **kwargs)
+
+ return oplist
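+
+
+# Example (illustrative):
+#
+# @tensor_operation
+# def add(a, b):
+# return a + b
+#
+# add(TensorList([x1, x2]), 2) # -> TensorList([x1 + 2, x2 + 2])
+# add(2, TensorList([x1, x2])) # also broadcasts over the second operand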
diff --git a/PaddleCV/tracking/pytracking/parameter/atom/__init__.py b/PaddleCV/tracking/pytracking/parameter/atom/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/PaddleCV/tracking/pytracking/parameter/atom/default_vot.py b/PaddleCV/tracking/pytracking/parameter/atom/default_vot.py
new file mode 100644
index 0000000000000000000000000000000000000000..e59e018785f548b1903f02588caf1ae99b300984
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/parameter/atom/default_vot.py
@@ -0,0 +1,114 @@
+import numpy as np
+
+from pytracking.features.deep import ResNet18, ResNet50
+from pytracking.features.extractor import MultiResolutionExtractor
+from pytracking.utils import TrackerParams, FeatureParams
+
+
+def parameters():
+ params = TrackerParams()
+
+ # These are usually set from outside
+ params.debug = 0 # Debug level
+ params.visualization = False # Do visualization
+
+ # Use GPU or not (IoUNet requires this to be True)
+ params.use_gpu = True
+
+ # Feature specific parameters
+ deep_params = TrackerParams()
+
+ # Patch sampling parameters
+ params.max_image_sample_size = (14 * 16)**2 # Maximum image sample size
+ params.min_image_sample_size = (14 * 16)**2 # Minimum image sample size
+ params.search_area_scale = 4 # Scale relative to target size
+ params.feature_size_odd = False # Good to use False for even-sized kernels and vice versa
+
+ # Optimization parameters
+ params.CG_iter = 5 # The number of Conjugate Gradient iterations in each update after the first frame
+ params.init_CG_iter = 60 # The total number of Conjugate Gradient iterations used in the first frame
+ params.init_GN_iter = 6 # The number of Gauss-Newton iterations used in the first frame (only if the projection matrix is updated)
+ params.post_init_CG_iter = 0 # CG iterations to run after GN
+ params.fletcher_reeves = False # Use the Fletcher-Reeves (true) or Polak-Ribiere (false) formula in the Conjugate Gradient
+ params.standard_alpha = True # Use the standard formula for computing the step length in Conjugate Gradient
+ params.CG_forgetting_rate = None # Forgetting rate of the last conjugate direction
+
+ # Learning parameters for each feature type
+ deep_params.learning_rate = 0.0075 # Learning rate
+ deep_params.output_sigma_factor = 1 / 4 # Standard deviation of Gaussian label relative to target size
+
+ # Training parameters
+ params.sample_memory_size = 250 # Memory size
+ params.train_skipping = 10 # How often to run training (every n-th frame)
+
+ # Online model parameters
+ # Use deep_params.kernel_size = (4, 4) once slice double grad is supported;
+ # until then, fall back to deep_params.kernel_size = (5, 5)
+ deep_params.kernel_size = (5, 5) # Kernel size of filter
+ deep_params.compressed_dim = 64 # Dimension output of projection matrix
+ deep_params.filter_reg = 1e-1 # Filter regularization factor
+ deep_params.projection_reg = 1e-4 # Projection regularization factor
+
+ # Windowing
+ params.feature_window = False # Perform windowing of features
+ params.window_output = True # Perform windowing of output scores
+
+ # Detection parameters
+ params.scale_factors = np.array(
+ [1], dtype='float32'
+ ) # What scales to use for localization (only one scale if IoUNet is used)
+ params.score_upsample_factor = 1 # How much Fourier upsampling to use
+
+ # Init data augmentation parameters
+ params.augmentation = {
+ 'fliplr': True,
+ 'rotate': [5, -5, 10, -10, 20, -20, 30, -30, 45, -45, -60, 60],
+ 'blur': [(2, 0.2), (0.2, 2), (3, 1), (1, 3), (2, 2)],
+ 'relativeshift': [(0.6, 0.6), (-0.6, 0.6), (0.6, -0.6), (-0.6, -0.6)],
+ 'dropout': (7, 0.2)
+ }
+
+ params.augmentation_expansion_factor = 2 # How much to expand sample when doing augmentation
+ params.random_shift_factor = 1 / 3 # How much random shift to do on each augmented sample
+ deep_params.use_augmentation = True # Whether to use augmentation for this feature
+
+ # Factorized convolution parameters
+ # params.use_projection_matrix = True # Use projection matrix, i.e. use the factorized convolution formulation
+ params.update_projection_matrix = True # Whether the projection matrix should be optimized or not
+ params.proj_init_method = 'randn' # Method for initializing the projection matrix
+ params.filter_init_method = 'randn' # Method for initializing the spatial filter
+ params.projection_activation = 'none' # Activation function after projection ('none', 'relu', 'elu' or 'mlu')
+ params.response_activation = (
+ 'mlu', 0.05
+ ) # Activation function on the output scores ('none', 'relu', 'elu' or 'mlu')
+
+ # Advanced localization parameters
+ params.advanced_localization = True # Use this or not
+ params.target_not_found_threshold = -1 # Absolute score threshold to detect target missing
+ params.distractor_threshold = 100 # Relative threshold to find distractors
+ params.hard_negative_threshold = 0.3 # Relative threshold to find hard negative samples
+ params.target_neighborhood_scale = 2.2 # Target neighborhood to remove
+ params.dispalcement_scale = 0.7 # Displacement to consider for distractors
+ params.hard_negative_learning_rate = 0.02 # Learning rate if hard negative detected
+ params.hard_negative_CG_iter = 5 # Number of optimization iterations to use if hard negative detected
+ params.update_scale_when_uncertain = True # Update scale or not if distractor is close
+
+ # IoUNet parameters
+ params.iounet_augmentation = False # Use the augmented samples to compute the modulation vector
+ params.iounet_k = 3 # Top-k average to estimate final box
+ params.num_init_random_boxes = 9 # Num extra random boxes in addition to the classifier prediction
+ params.box_jitter_pos = 0.1 # How much to jitter the translation for random boxes
+ params.box_jitter_sz = 0.5 # How much to jitter the scale for random boxes
+ params.maximal_aspect_ratio = 6 # Limit on the aspect ratio
+ params.box_refinement_iter = 5 # Number of iterations for refining the boxes
+ params.box_refinement_step_length = 1 # Gradient step length in the bounding box refinement
+ params.box_refinement_step_decay = 1 # Multiplicative step length decay (1 means no decay)
+
+ # Setup the feature extractor (which includes the IoUNet)
+ deep_fparams = FeatureParams(feature_params=[deep_params])
+ deep_feat = ResNet18(
+ output_layers=['block2'], fparams=deep_fparams, normalize_power=2)
+ params.features = MultiResolutionExtractor([deep_feat])
+
+ params.vot_anno_conversion_type = 'preserve_area'
+ return params
diff --git a/PaddleCV/tracking/pytracking/parameter/siamfc/default.py b/PaddleCV/tracking/pytracking/parameter/siamfc/default.py
new file mode 100644
index 0000000000000000000000000000000000000000..d77c11f9a9e5dd3cd856d7a227be3a2cb126c22d
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/parameter/siamfc/default.py
@@ -0,0 +1,56 @@
+import numpy as np
+
+from pytracking.features import deep
+from pytracking.features.extractor import MultiResolutionExtractor
+from pytracking.utils import TrackerParams, FeatureParams
+
+
+def parameters():
+ params = TrackerParams()
+
+ # These are usually set from outside
+ params.debug = 0 # Debug level
+ params.visualization = False # Do visualization
+
+ # Use GPU or not (IoUNet requires this to be True)
+ params.use_gpu = True
+
+ # Feature specific parameters
+ deep_params = TrackerParams()
+
+ # Patch sampling parameters
+ params.exemplar_size = 127
+ params.max_image_sample_size = 255 * 255 # Maximum image sample size
+ params.min_image_sample_size = 255 * 255 # Minimum image sample size
+
+ # Detection parameters
+ params.scale_factors = 1.0375**np.array(
+ [-1, 0, 1]
+ ) # What scales to use for localization (only one scale if IoUNet is used)
+ params.score_upsample_factor = 16 # How much Fourier upsampling to use
+ params.scale_penalty = 0.9745
+ params.scale_lr = 0.59
+ params.window_influence = 0.176
+ params.total_stride = 8
+
+ # Setup the feature extractor (which includes the IoUNet)
+ deep_fparams = FeatureParams(feature_params=[deep_params])
+ deep_feat = deep.SFCAlexnet(
+ net_path='/ssd2/bily/code/baidu/personal-code/pytracking/ltr/checkpoints/ltr/fs/siamrpn50/SiamRPN_ep0001.pth.tar',
+ output_layers=['conv5'],
+ fparams=deep_fparams)
+ params.features = MultiResolutionExtractor([deep_feat])
+
+ params.net_path = None
+ params.response_up = 16
+ params.response_sz = 17
+ params.context = 0.5
+ params.instance_sz = 255
+ params.exemplar_sz = 127
+ params.scale_num = 3
+ params.scale_step = 1.0375
+ params.scale_lr = 0.59
+ params.scale_penalty = 0.9745
+ params.window_influence = 0.176
+ params.total_stride = 8
+ return params
diff --git a/PaddleCV/tracking/pytracking/tracker/__init__.py b/PaddleCV/tracking/pytracking/tracker/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/PaddleCV/tracking/pytracking/tracker/atom/__init__.py b/PaddleCV/tracking/pytracking/tracker/atom/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..40ad3b507bba6ab768fcae3db0c5d71724b6627d
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/tracker/atom/__init__.py
@@ -0,0 +1,5 @@
+from .atom import ATOM
+
+
+def get_tracker_class():
+ return ATOM
diff --git a/PaddleCV/tracking/pytracking/tracker/atom/atom.py b/PaddleCV/tracking/pytracking/tracker/atom/atom.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a08cff271f76ac38496f51ece2dcab166a0b137
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/tracker/atom/atom.py
@@ -0,0 +1,1021 @@
+import math
+import os
+import time
+
+import numpy as np
+from paddle import fluid
+from paddle.fluid import layers
+
+from pytracking.features import augmentation
+from pytracking.libs import dcf, operation, fourier
+from pytracking.libs.optimization import ConjugateGradient, GaussNewtonCG, GradientDescentL2
+from pytracking.libs.paddle_utils import mod, n2p, \
+ leaky_relu, dropout2d
+from pytracking.libs.tensorlist import TensorList
+from pytracking.tracker.atom.optim import FactorizedConvProblem, ConvProblem
+from pytracking.tracker.base.basetracker import BaseTracker
+
+
+class ATOM(BaseTracker):
+ def initialize_features(self):
+ if not getattr(self, 'features_initialized', False):
+ self.params.features.initialize()
+ self.features_initialized = True
+
+ def initialize(self, image, state, *args, **kwargs):
+ # Initialize some stuff
+ self.frame_num = 1
+ # TODO: for now, we don't support explicitly setting the device
+ # if not hasattr(self.params, 'device'):
+ # self.params.device = 'cuda' if self.params.use_gpu else 'cpu'
+
+ # Initialize features
+ self.initialize_features()
+
+ # Check if image is color
+ self.params.features.set_is_color(image.shape[2] == 3)
+
+ # Get feature specific params
+ self.fparams = self.params.features.get_fparams('feature_params')
+
+ self.time = 0
+ tic = time.time()
+
+ # Get position and size
+ self.pos = np.array(
+ [state[1] + (state[3] - 1) / 2, state[0] + (state[2] - 1) / 2],
+ 'float32')
+ self.target_sz = np.array([state[3], state[2]], 'float32')
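+ # The input state is [x, y, w, h] in image coordinates; internally the
+ # tracker keeps pos as the target center [cy, cx] and target_sz as [h, w].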
+
+ # Set search area
+ self.target_scale = 1.0
+ search_area = np.prod(self.target_sz * self.params.search_area_scale)
+ if search_area > self.params.max_image_sample_size:
+ self.target_scale = math.sqrt(search_area /
+ self.params.max_image_sample_size)
+ elif search_area < self.params.min_image_sample_size:
+ self.target_scale = math.sqrt(search_area /
+ self.params.min_image_sample_size)
+
+ # Check if IoUNet is used
+ self.use_iou_net = getattr(self.params, 'use_iou_net', True)
+
+ # Target size in base scale
+ self.base_target_sz = self.target_sz / self.target_scale
+
+ # Use odd square search area and set sizes
+ feat_max_stride = max(self.params.features.stride())
+ if getattr(self.params, 'search_area_shape', 'square') == 'square':
+ self.img_sample_sz = np.ones((2, ), 'float32') * np.round(
+ np.sqrt(
+ np.prod(self.base_target_sz *
+ self.params.search_area_scale)))
+ elif self.params.search_area_shape == 'initrect':
+ self.img_sample_sz = np.round(self.base_target_sz *
+ self.params.search_area_scale)
+ else:
+ raise ValueError('Unknown search area shape')
+ if self.params.feature_size_odd:
+ self.img_sample_sz += feat_max_stride - mod(self.img_sample_sz,
+ (2 * feat_max_stride))
+ else:
+ self.img_sample_sz += feat_max_stride - mod(
+ (self.img_sample_sz + feat_max_stride), (2 * feat_max_stride))
+
+ # Set sizes
+ self.img_support_sz = self.img_sample_sz
+ self.feature_sz = self.params.features.size(self.img_sample_sz)
+ self.output_sz = self.params.score_upsample_factor * self.img_support_sz # Interpolated size of the output
+ self.kernel_size = self.fparams.attribute('kernel_size')
+
+ self.iou_img_sample_sz = self.img_sample_sz
+
+ # Optimization options
+ self.params.precond_learning_rate = self.fparams.attribute(
+ 'learning_rate')
+ if self.params.CG_forgetting_rate is None or max(
+ self.params.precond_learning_rate) >= 1:
+ self.params.direction_forget_factor = 0
+ else:
+ self.params.direction_forget_factor = (
+ 1 - max(self.params.precond_learning_rate)
+ )**self.params.CG_forgetting_rate
+
+ self.output_window = None
+ if getattr(self.params, 'window_output', False):
+ if getattr(self.params, 'use_clipped_window', False):
+ self.output_window = dcf.hann2d_clipped(
+ self.output_sz.astype('long'),
+ self.output_sz.astype('long') *
+ self.params.effective_search_area /
+ self.params.search_area_scale,
+ centered=False)
+ else:
+ self.output_window = dcf.hann2d(
+ self.output_sz.astype('long'), centered=False)
+
+ # Initialize some learning things
+ self.init_learning()
+
+ # Convert image
+ im = image.astype('float32')
+ self.im = im # For debugging only
+
+ # Setup scale bounds
+ self.image_sz = np.array([im.shape[0], im.shape[1]], 'float32')
+ self.min_scale_factor = np.max(10 / self.base_target_sz)
+ self.max_scale_factor = np.min(self.image_sz / self.base_target_sz)
+
+ # Extract and transform sample
+ x = self.generate_init_samples(im)
+
+ # Initialize iounet
+ if self.use_iou_net:
+ self.init_iou_net()
+
+ # Initialize projection matrix
+ self.init_projection_matrix(x)
+
+ # Transform to get the training sample
+ train_x = self.preprocess_sample(x)
+
+ # Generate label function
+ init_y = self.init_label_function(train_x)
+
+ # Init memory
+ self.init_memory(train_x)
+
+ # Init optimizer and do initial optimization
+ self.init_optimization(train_x, init_y)
+
+ self.pos_iounet = self.pos.copy()
+
+ self.time += time.time() - tic
+
+ def track(self, image):
+
+ self.frame_num += 1
+
+ # Convert image
+ # im = numpy_to_paddle(image)
+ im = image.astype('float32')
+ self.im = im # For debugging only
+
+ # ------- LOCALIZATION ------- #
+
+ # Get sample
+ sample_pos = self.pos.round()
+ sample_scales = self.target_scale * self.params.scale_factors
+
+ test_x = self.extract_processed_sample(im, self.pos, sample_scales,
+ self.img_sample_sz)
+
+ # Compute scores
+ scores_raw = self.apply_filter(test_x)
+ translation_vec, scale_ind, s, flag = self.localize_target(scores_raw)
+
+ # Update position and scale
+ if flag != 'not_found':
+ if self.use_iou_net:
+ update_scale_flag = getattr(self.params,
+ 'update_scale_when_uncertain',
+ True) or flag != 'uncertain'
+ if getattr(self.params, 'use_classifier', True):
+ self.update_state(sample_pos + translation_vec)
+ self.refine_target_box(sample_pos, sample_scales[scale_ind],
+ scale_ind, update_scale_flag)
+ elif getattr(self.params, 'use_classifier', True):
+ self.update_state(sample_pos + translation_vec,
+ sample_scales[scale_ind])
+
+ # ------- UPDATE ------- #
+
+ # Check flags and set learning rate if hard negative
+ update_flag = flag not in ['not_found', 'uncertain']
+ hard_negative = (flag == 'hard_negative')
+ learning_rate = self.params.hard_negative_learning_rate if hard_negative else None
+
+ if update_flag:
+ # Get train sample
+ train_x = TensorList([x[scale_ind:scale_ind + 1] for x in test_x])
+
+ # Create label for sample
+ train_y = self.get_label_function(sample_pos,
+ sample_scales[scale_ind])
+
+ # Update memory
+ self.update_memory(train_x, train_y, learning_rate)
+
+ # Train filter
+ if hard_negative:
+ self.filter_optimizer.run(self.params.hard_negative_CG_iter)
+ elif (self.frame_num - 1) % self.params.train_skipping == 0:
+ self.filter_optimizer.run(self.params.CG_iter)
+ self.filter = self.filter_optimizer.x
+
+ # Set the pos of the tracker to iounet pos
+ if self.use_iou_net and flag != 'not_found':
+ self.pos = self.pos_iounet.copy()
+
+ # Return new state
+ yx = self.pos - (self.target_sz - 1) / 2
+ new_state = np.array(
+ [yx[1], yx[0], self.target_sz[1], self.target_sz[0]], 'float32')
+
+ return new_state.tolist()
+
+ def update_memory(self,
+ sample_x: TensorList,
+ sample_y: TensorList,
+ learning_rate=None):
+ replace_ind = self.update_sample_weights(
+ self.sample_weights, self.previous_replace_ind,
+ self.num_stored_samples, self.num_init_samples, self.fparams,
+ learning_rate)
+ self.previous_replace_ind = replace_ind
+ for train_samp, x, ind in zip(self.training_samples, sample_x,
+ replace_ind):
+ train_samp[ind] = x[0]
+ for y_memory, y, ind in zip(self.y, sample_y, replace_ind):
+ y_memory[ind] = y[0]
+ if self.hinge_mask is not None:
+ for m, y, ind in zip(self.hinge_mask, sample_y, replace_ind):
+ m[ind] = layers.cast(y >= self.params.hinge_threshold,
+ 'float32')[0]
+ self.num_stored_samples += 1
+
+ def update_sample_weights(self,
+ sample_weights,
+ previous_replace_ind,
+ num_stored_samples,
+ num_init_samples,
+ fparams,
+ learning_rate=None):
+ # Update weights and get index to replace in memory
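+ # Replacement rule implemented below: overwrite the stored sample with the
+ # smallest weight (never one of the first num_init samples when an
+ # init_samples_minimum_weight is set), then renormalize the weights.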
+ replace_ind = []
+ for sw, prev_ind, num_samp, num_init, fpar in zip(
+ sample_weights, previous_replace_ind, num_stored_samples,
+ num_init_samples, fparams):
+ lr = learning_rate
+ if lr is None:
+ lr = fpar.learning_rate
+
+ init_samp_weight = getattr(fpar, 'init_samples_minimum_weight',
+ None)
+ if init_samp_weight == 0:
+ init_samp_weight = None
+ s_ind = 0 if init_samp_weight is None else num_init
+
+ if num_samp == 0 or lr == 1:
+ sw[:] = 0
+ sw[0] = 1
+ r_ind = 0
+ else:
+ # Get index to replace
+ r_ind = np.argmin(sw[s_ind:], 0)
+ r_ind = int(r_ind + s_ind)
+
+ # Update weights
+ if prev_ind is None:
+ sw /= 1 - lr
+ sw[r_ind] = lr
+ else:
+ sw[r_ind] = sw[prev_ind] / (1 - lr)
+
+ sw /= sw.sum()
+ if init_samp_weight is not None and sw[:num_init].sum(
+ ) < init_samp_weight:
+ sw /= init_samp_weight + sw[num_init:].sum()
+ sw[:num_init] = init_samp_weight / num_init
+
+ replace_ind.append(r_ind)
+
+ return replace_ind
+
+ def localize_target(self, scores_raw):
+ # Weighted sum (if multiple features) with interpolation in fourier domain
+ weight = self.fparams.attribute('translation_weight', 1.0)
+ scores_raw = weight * scores_raw
+ sf_weighted = fourier.cfft2(scores_raw) / (scores_raw.size(2) *
+ scores_raw.size(3))
+ for i, (sz, ksz) in enumerate(zip(self.feature_sz, self.kernel_size)):
+ sf_weighted[i] = fourier.shift_fs(sf_weighted[i], math.pi * (
+ 1 - np.array([ksz[0] % 2, ksz[1] % 2]) / sz))
+
+ scores_fs = fourier.sum_fs(sf_weighted)
+ scores = fourier.sample_fs(scores_fs, self.output_sz)
+
+ if self.output_window is not None and not getattr(
+ self.params, 'perform_hn_without_windowing', False):
+ scores *= self.output_window
+
+ if getattr(self.params, 'advanced_localization', False):
+ return self.localize_advanced(scores)
+
+ # Get maximum
+ max_score, max_disp = dcf.max2d(scores)
+ scale_ind = np.argmax(max_score, axis=0)[0]
+ max_disp = max_disp.astype('float32')
+
+ # Convert to displacements in the base scale
+ output_sz = self.output_sz.copy()
+ disp = mod((max_disp + output_sz / 2), output_sz) - output_sz / 2
+
+ # Compute translation vector and scale change factor
+ translation_vec = np.reshape(
+ disp[scale_ind].astype('float32'), [-1]) * (
+ self.img_support_sz / self.output_sz) * self.target_scale
+ translation_vec *= self.params.scale_factors[scale_ind]
+
+ # Shift the score output for visualization purposes
+ if self.params.debug >= 2:
+ sz = scores.shape[-2:]
+ scores = np.concatenate(
+ [scores[..., sz[0] // 2:, :], scores[..., :sz[0] // 2, :]], -2)
+ scores = np.concatenate(
+ [scores[..., sz[1] // 2:], scores[..., :sz[1] // 2]], -1)
+
+ return translation_vec, scale_ind, scores, None
+
+ def update_state(self, new_pos, new_scale=None):
+ # Update scale
+ if new_scale is not None:
+ self.target_scale = np.clip(new_scale, self.min_scale_factor,
+ self.max_scale_factor)
+ self.target_sz = self.base_target_sz * self.target_scale
+
+ # Update pos
+ inside_ratio = 0.2
+ inside_offset = (inside_ratio - 0.5) * self.target_sz
+ self.pos = np.maximum(
+ np.minimum(new_pos,
+ self.image_sz.astype('float32') - inside_offset),
+ inside_offset)
+
+ def get_label_function(self, sample_pos, sample_scale):
+ # Generate label function
+ train_y = TensorList()
+ target_center_norm = (self.pos - sample_pos) / (self.img_support_sz *
+ sample_scale)
+ for sig, sz, ksz in zip(self.sigma, self.feature_sz, self.kernel_size):
+ center = sz * target_center_norm + 0.5 * np.array(
+ [(ksz[0] + 1) % 2, (ksz[1] + 1) % 2], 'float32')
+ train_y.append(dcf.label_function_spatial(sz, sig, center))
+ return train_y
+
+ def extract_sample(self,
+ im: np.ndarray,
+ pos: np.ndarray,
+ scales,
+ sz: np.ndarray,
+ debug_save_name):
+ return self.params.features.extract(im, pos, scales, sz,
+ debug_save_name)
+
+ def extract_processed_sample(self,
+ im: np.ndarray,
+ pos: np.ndarray,
+ scales,
+ sz: np.ndarray,
+ debug_save_name=None) -> (TensorList,
+ TensorList):
+ x = self.extract_sample(im, pos, scales, sz, debug_save_name)
+ return self.preprocess_sample(self.project_sample(x))
+
+ def apply_filter(self, sample_x: TensorList):
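+ # Classification scores are a 'same'-padded convolution of the projected
+ # sample features with the current filter, evaluated eagerly under a
+ # dygraph guard.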
+ with fluid.dygraph.guard():
+ sample_x = sample_x.apply(n2p)
+ filter = self.filter.apply(n2p)
+ return operation.conv2d(sample_x, filter, mode='same').numpy()
+
+ def init_projection_matrix(self, x):
+ # Set if using projection matrix
+ self.params.use_projection_matrix = getattr(
+ self.params, 'use_projection_matrix', True)
+
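+ # Factorized convolution: features are first compressed by a learned 1x1
+ # projection of shape [compressed_dim, C, 1, 1] (see project_sample) before
+ # the correlation filter is applied.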
+ if self.params.use_projection_matrix:
+ self.compressed_dim = self.fparams.attribute('compressed_dim', None)
+
+ proj_init_method = getattr(self.params, 'proj_init_method', 'pca')
+ if proj_init_method == 'pca':
+ raise NotImplementedError
+ elif proj_init_method == 'randn':
+ with fluid.dygraph.guard():
+ self.projection_matrix = TensorList([
+ None if cdim is None else layers.gaussian_random(
+ (cdim, ex.shape[1], 1, 1), 0.0,
+ 1 / math.sqrt(ex.shape[1])).numpy()
+ for ex, cdim in zip(x, self.compressed_dim)
+ ])
+ elif proj_init_method == 'np_randn':
+ rng = np.random.RandomState(0)
+ self.projection_matrix = TensorList([
+ None if cdim is None else rng.normal(
+ size=(cdim, ex.shape[1], 1, 1),
+ loc=0.0,
+ scale=1 / math.sqrt(ex.shape[1])).astype('float32')
+ for ex, cdim in zip(x, self.compressed_dim)
+ ])
+ elif proj_init_method == 'ones':
+ self.projection_matrix = TensorList([
+ None if cdim is None else
+ np.ones((cdim, ex.shape[1], 1, 1),
+ 'float32') / math.sqrt(ex.shape[1])
+ for ex, cdim in zip(x, self.compressed_dim)
+ ])
+ else:
+ self.compressed_dim = x.size(1)
+ self.projection_matrix = TensorList([None] * len(x))
+
+ def preprocess_sample(self, x: TensorList) -> (TensorList, TensorList):
+ if getattr(self.params, '_feature_window', False):
+ x = x * self.feature_window
+ return x
+
+ def init_label_function(self, train_x):
+ # Allocate label function
+ self.y = TensorList([
+ np.zeros(
+ [self.params.sample_memory_size, 1, x.shape[2], x.shape[3]],
+ 'float32') for x in train_x
+ ])
+
+ # Output sigma factor
+ output_sigma_factor = self.fparams.attribute('output_sigma_factor')
+ self.sigma = output_sigma_factor * np.ones((2, ), 'float32') * (
+ self.feature_sz / self.img_support_sz *
+ self.base_target_sz).apply(np.prod).apply(np.sqrt)
+
+ # Center pos in normalized coords
+ target_center_norm = (self.pos - np.round(self.pos)) / (
+ self.target_scale * self.img_support_sz)
+
+ # Generate label functions
+ for y, sig, sz, ksz, x in zip(self.y, self.sigma, self.feature_sz,
+ self.kernel_size, train_x):
+ center_pos = sz * target_center_norm + 0.5 * np.array(
+ [(ksz[0] + 1) % 2, (ksz[1] + 1) % 2], 'float32')
+ for i, T in enumerate(self.transforms[:x.shape[0]]):
+ sample_center = center_pos + np.array(
+ T.shift, 'float32') / self.img_support_sz * sz
+ y[i] = dcf.label_function_spatial(sz, sig, sample_center)
+
+ # Return only the ones to use for initial training
+ return TensorList([y[:x.shape[0]] for y, x in zip(self.y, train_x)])
+
+ def init_memory(self, train_x):
+ # Initialize first-frame training samples
+ self.num_init_samples = train_x.size(0)
+ self.init_sample_weights = TensorList(
+ [np.ones(x.shape[0], 'float32') / x.shape[0] for x in train_x])
+ self.init_training_samples = train_x
+
+ # Sample counters and weights
+ self.num_stored_samples = self.num_init_samples.copy()
+ self.previous_replace_ind = [None] * len(self.num_stored_samples)
+ self.sample_weights = TensorList([
+ np.zeros(self.params.sample_memory_size, 'float32') for x in train_x
+ ])
+ for sw, init_sw, num in zip(self.sample_weights,
+ self.init_sample_weights,
+ self.num_init_samples):
+ sw[:num] = init_sw
+
+ # Initialize memory
+ self.training_samples = TensorList(
+ [[np.zeros([cdim, x.shape[2], x.shape[3]], 'float32')] *
+ self.params.sample_memory_size
+ for x, cdim in zip(train_x, self.compressed_dim)])
+
+ def init_learning(self):
+ # Get window function
+ self.feature_window = TensorList(
+ [dcf.hann2d(sz) for sz in self.feature_sz])
+
+ # Filter regularization
+ self.filter_reg = self.fparams.attribute('filter_reg')
+
+ # Activation function after the projection matrix (phi_1 in the paper)
+ projection_activation = getattr(self.params, 'projection_activation',
+ 'none')
+ if isinstance(projection_activation, tuple):
+ projection_activation, act_param = projection_activation
+
+ if projection_activation == 'none':
+ self.projection_activation = lambda x: x
+ elif projection_activation == 'relu':
+ self.projection_activation = layers.relu
+ elif projection_activation == 'elu':
+ self.projection_activation = layers.elu
+ elif projection_activation == 'mlu':
+ self.projection_activation = lambda x: layers.elu(leaky_relu(x, 1 / act_param), act_param)
+ else:
+ raise ValueError('Unknown activation')
+
+ # Activation function after the output scores (phi_2 in the paper)
+ response_activation = getattr(self.params, 'response_activation',
+ 'none')
+ if isinstance(response_activation, tuple):
+ response_activation, act_param = response_activation
+
+ if response_activation == 'none':
+ self.response_activation = lambda x: x
+ elif response_activation == 'relu':
+ self.response_activation = layers.relu
+ elif response_activation == 'elu':
+ self.response_activation = layers.elu
+ elif response_activation == 'mlu':
+ self.response_activation = lambda x: layers.elu(leaky_relu(x, 1 / act_param), act_param)
+ else:
+ raise ValueError('Unknown activation')
+
+ def generate_init_samples(self, im: np.ndarray) -> TensorList:
+ """Generate augmented initial samples."""
+
+ # Compute augmentation size
+ aug_expansion_factor = getattr(self.params,
+ 'augmentation_expansion_factor', None)
+ aug_expansion_sz = self.img_sample_sz.copy()
+ aug_output_sz = None
+ if aug_expansion_factor is not None and aug_expansion_factor != 1:
+ aug_expansion_sz = (self.img_sample_sz *
+ aug_expansion_factor).astype('long')
+ aug_expansion_sz += (
+ aug_expansion_sz - self.img_sample_sz.astype('long')) % 2
+ aug_expansion_sz = aug_expansion_sz.astype('float32')
+ aug_output_sz = self.img_sample_sz.astype('long').tolist()
+
+ # Random shift operator
+ get_rand_shift = lambda: None
+ random_shift_factor = getattr(self.params, 'random_shift_factor', 0)
+ if random_shift_factor > 0:
+ get_rand_shift = lambda: ((np.random.uniform(size=[2]) - 0.5) * self.img_sample_sz * random_shift_factor).astype('long').tolist()
+
+ # Create transformations
+ self.transforms = [augmentation.Identity(aug_output_sz)]
+ if 'shift' in self.params.augmentation:
+ self.transforms.extend([
+ augmentation.Translation(shift, aug_output_sz)
+ for shift in self.params.augmentation['shift']
+ ])
+ if 'relativeshift' in self.params.augmentation:
+ get_absolute = lambda shift: (np.array(shift, 'float32') * self.img_sample_sz / 2).astype('long').tolist()
+ self.transforms.extend([
+ augmentation.Translation(get_absolute(shift), aug_output_sz)
+ for shift in self.params.augmentation['relativeshift']
+ ])
+ if 'fliplr' in self.params.augmentation and self.params.augmentation[
+ 'fliplr']:
+ self.transforms.append(
+ augmentation.FlipHorizontal(aug_output_sz, get_rand_shift()))
+ if 'blur' in self.params.augmentation:
+ self.transforms.extend([
+ augmentation.Blur(sigma, aug_output_sz, get_rand_shift())
+ for sigma in self.params.augmentation['blur']
+ ])
+ if 'scale' in self.params.augmentation:
+ self.transforms.extend([
+ augmentation.Scale(scale_factor, aug_output_sz,
+ get_rand_shift())
+ for scale_factor in self.params.augmentation['scale']
+ ])
+ if 'rotate' in self.params.augmentation:
+ self.transforms.extend([
+ augmentation.Rotate(angle, aug_output_sz, get_rand_shift())
+ for angle in self.params.augmentation['rotate']
+ ])
+
+ # Generate initial samples
+ init_samples = self.params.features.extract_transformed(
+ im, self.pos, self.target_scale, aug_expansion_sz, self.transforms)
+
+ # Remove augmented samples for features that should not use augmentation
+ for i, use_aug in enumerate(self.fparams.attribute('use_augmentation')):
+ if not use_aug:
+ init_samples[i] = init_samples[i][0:1]
+
+ # Add dropout samples
+ if 'dropout' in self.params.augmentation:
+ num, prob = self.params.augmentation['dropout']
+ self.transforms.extend(self.transforms[:1] * num)
+ with fluid.dygraph.guard():
+ for i, use_aug in enumerate(
+ self.fparams.attribute('use_augmentation')):
+ if use_aug:
+ init_samples[i] = np.concatenate([
+ init_samples[i], dropout2d(
+ layers.expand(
+ n2p(init_samples[i][0:1]), (num, 1, 1, 1)),
+ prob,
+ is_train=True).numpy()
+ ])
+
+ return init_samples
+
+ def init_optimization(self, train_x, init_y):
+ # Initialize filter
+ filter_init_method = getattr(self.params, 'filter_init_method', 'zeros')
+ self.filter = TensorList([
+ np.zeros([1, cdim, sz[0], sz[1]], 'float32')
+ for x, cdim, sz in zip(train_x, self.compressed_dim,
+ self.kernel_size)
+ ])
+ if filter_init_method == 'zeros':
+ pass
+ elif filter_init_method == 'ones':
+ for idx, f in enumerate(self.filter):
+ self.filter[idx] = np.ones(f.shape,
+ 'float32') / np.prod(f.shape)
+ elif filter_init_method == 'np_randn':
+ rng = np.random.RandomState(0)
+ for idx, f in enumerate(self.filter):
+ self.filter[idx] = rng.normal(
+ size=f.shape, loc=0,
+ scale=1 / np.prod(f.shape)).astype('float32')
+ elif filter_init_method == 'randn':
+ for idx, f in enumerate(self.filter):
+ with fluid.dygraph.guard():
+ self.filter[idx] = layers.gaussian_random(
+ f.shape, std=1 / np.prod(f.shape)).numpy()
+ else:
+ raise ValueError('Unknown "filter_init_method"')
+
+ # Get parameters
+ self.params.update_projection_matrix = getattr(
+ self.params, 'update_projection_matrix',
+ True) and self.params.use_projection_matrix
+ optimizer = getattr(self.params, 'optimizer', 'GaussNewtonCG')
+
+ # Setup factorized joint optimization
+ if self.params.update_projection_matrix:
+ self.joint_problem = FactorizedConvProblem(
+ self.init_training_samples, init_y, self.filter_reg,
+ self.fparams.attribute('projection_reg'), self.params,
+ self.init_sample_weights, self.projection_activation,
+ self.response_activation)
+
+ # Variable containing both filter and projection matrix
+ joint_var = self.filter.concat(self.projection_matrix)
+
+ # Initialize optimizer
+ analyze_convergence = getattr(self.params, 'analyze_convergence',
+ False)
+ if optimizer == 'GaussNewtonCG':
+ self.joint_optimizer = GaussNewtonCG(
+ self.joint_problem,
+ joint_var,
+ plotting=(self.params.debug >= 3),
+ analyze=True,
+ fig_num=(12, 13, 14))
+ elif optimizer == 'GradientDescentL2':
+ self.joint_optimizer = GradientDescentL2(
+ self.joint_problem,
+ joint_var,
+ self.params.optimizer_step_length,
+ self.params.optimizer_momentum,
+ plotting=(self.params.debug >= 3),
+ debug=analyze_convergence,
+ fig_num=(12, 13))
+
+ # Do joint optimization
+ if isinstance(self.params.init_CG_iter, (list, tuple)):
+ self.joint_optimizer.run(self.params.init_CG_iter)
+ else:
+ self.joint_optimizer.run(self.params.init_CG_iter //
+ self.params.init_GN_iter,
+ self.params.init_GN_iter)
+
+            # Split the optimized variable back into filter and projection matrix
+ len_x = len(self.joint_optimizer.x)
+ self.filter = self.joint_optimizer.x[:len_x // 2] # w2 in paper
+ self.projection_matrix = self.joint_optimizer.x[len_x //
+ 2:] # w1 in paper
+
+ if analyze_convergence:
+ opt_name = 'CG' if getattr(self.params, 'CG_optimizer',
+ True) else 'GD'
+ for val_name, values in zip(['loss', 'gradient'], [
+ self.joint_optimizer.losses,
+ self.joint_optimizer.gradient_mags
+ ]):
+ val_str = ' '.join(
+ ['{:.8e}'.format(v.item()) for v in values])
+ file_name = '{}_{}.txt'.format(opt_name, val_name)
+ with open(file_name, 'a') as f:
+ f.write(val_str + '\n')
+ raise RuntimeError('Exiting')
+
+ # Re-project samples with the new projection matrix
+ compressed_samples = self.project_sample(self.init_training_samples,
+ self.projection_matrix)
+ for train_samp, init_samp in zip(self.training_samples,
+ compressed_samples):
+ for idx in range(init_samp.shape[0]):
+ train_samp[idx] = init_samp[idx]
+
+ self.hinge_mask = None
+
+ # Initialize optimizer
+ self.conv_problem = ConvProblem(self.training_samples, self.y,
+ self.filter_reg, self.sample_weights,
+ self.response_activation)
+
+ if optimizer == 'GaussNewtonCG':
+ self.filter_optimizer = ConjugateGradient(
+ self.conv_problem,
+ self.filter,
+ fletcher_reeves=self.params.fletcher_reeves,
+ direction_forget_factor=self.params.direction_forget_factor,
+ debug=(self.params.debug >= 3),
+ fig_num=(12, 13))
+ elif optimizer == 'GradientDescentL2':
+ self.filter_optimizer = GradientDescentL2(
+ self.conv_problem,
+ self.filter,
+ self.params.optimizer_step_length,
+ self.params.optimizer_momentum,
+ debug=(self.params.debug >= 3),
+ fig_num=12)
+
+ # Transfer losses from previous optimization
+ if self.params.update_projection_matrix:
+ self.filter_optimizer.residuals = self.joint_optimizer.residuals
+ self.filter_optimizer.losses = self.joint_optimizer.losses
+
+ if not self.params.update_projection_matrix:
+ self.filter_optimizer.run(self.params.init_CG_iter)
+
+ # Post optimization
+ self.filter_optimizer.run(self.params.post_init_CG_iter)
+ self.filter = self.filter_optimizer.x
+
+ # Free memory
+ del self.init_training_samples
+ if self.params.use_projection_matrix:
+ del self.joint_problem, self.joint_optimizer
+
+ def project_sample(self, x: TensorList, proj_matrix=None):
+ # Apply projection matrix
+ if proj_matrix is None:
+ proj_matrix = self.projection_matrix
+ with fluid.dygraph.guard():
+ return operation.conv2d(x.apply(n2p), proj_matrix.apply(n2p)).apply(
+ self.projection_activation).numpy()
+
+ def get_iounet_box(self, pos, sz, sample_pos, sample_scale):
+ """All inputs in original image coordinates"""
+ box_center = (pos - sample_pos) / sample_scale + (self.iou_img_sample_sz
+ - 1) / 2
+ box_sz = sz / sample_scale
+ target_ul = box_center - (box_sz - 1) / 2
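+        # pos/sz are stored as (y, x)/(h, w); flipping yields an (x, y, w, h) box
+        # in the IoU-net crop coordinates.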
+ return np.concatenate([np.flip(target_ul, 0), np.flip(box_sz, 0)])
+
+ def get_iou_features(self):
+ return self.params.features.get_unique_attribute('iounet_features')
+
+ def get_iou_backbone_features(self):
+ return self.params.features.get_unique_attribute(
+ 'iounet_backbone_features')
+
+ def init_iou_net(self):
+ # Setup IoU net
+ self.iou_predictor = self.params.features.get_unique_attribute(
+ 'iou_predictor')
+
+ # Get target boxes for the different augmentations
+ self.iou_target_box = self.get_iounet_box(self.pos, self.target_sz,
+ self.pos.round(),
+ self.target_scale)
+ target_boxes = TensorList()
+ if self.params.iounet_augmentation:
+ for T in self.transforms:
+ if not isinstance(
+ T, (augmentation.Identity, augmentation.Translation,
+ augmentation.FlipHorizontal,
+ augmentation.FlipVertical, augmentation.Blur)):
+ break
+ target_boxes.append(self.iou_target_box + np.array(
+ [T.shift[1], T.shift[0], 0, 0]))
+ else:
+ target_boxes.append(self.iou_target_box.copy())
+ target_boxes = np.concatenate(target_boxes.view(1, 4), 0)
+
+ # Get iou features
+ iou_backbone_features = self.get_iou_backbone_features()
+
+ # Remove other augmentations such as rotation
+ iou_backbone_features = TensorList(
+ [x[:target_boxes.shape[0], ...] for x in iou_backbone_features])
+
+ # Extract target feat
+ with fluid.dygraph.guard():
+ iou_backbone_features = iou_backbone_features.apply(n2p)
+ target_boxes = n2p(target_boxes)
+ target_feat = self.iou_predictor.get_filter(iou_backbone_features,
+ target_boxes)
+ self.target_feat = TensorList(
+ [layers.reduce_mean(x, 0).numpy() for x in target_feat])
+
+ if getattr(self.params, 'iounet_not_use_reference', False):
+ self.target_feat = TensorList([
+                np.full_like(tf, np.linalg.norm(tf) / tf.size)
+ for tf in self.target_feat
+ ])
+
+ def optimize_boxes(self, iou_features, init_boxes):
+ with fluid.dygraph.guard():
+ # Optimize iounet boxes
+ init_boxes = np.reshape(init_boxes, (1, -1, 4))
+ step_length = self.params.box_refinement_step_length
+
+ target_feat = self.target_feat.apply(n2p)
+ iou_features = iou_features.apply(n2p)
+ output_boxes = n2p(init_boxes)
+
+ for f in iou_features:
+ f.stop_gradient = False
+ for i_ in range(self.params.box_refinement_iter):
+ # forward pass
+ bb_init = output_boxes
+ bb_init.stop_gradient = False
+
+ outputs = self.iou_predictor.predict_iou(target_feat,
+ iou_features, bb_init)
+
+ if isinstance(outputs, (list, tuple)):
+ outputs = outputs[0]
+
+ outputs.backward()
+
+ # Update proposal
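+                # Gradient ascent on the predicted IoU: the step is scaled by the
+                # box width/height so the refinement is size-invariant.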
+ bb_init_np = bb_init.numpy()
+ bb_init_gd = bb_init.gradient()
+ output_boxes = bb_init_np + step_length * bb_init_gd * np.tile(
+ bb_init_np[:, :, 2:], (1, 1, 2))
+ output_boxes = n2p(output_boxes)
+ step_length *= self.params.box_refinement_step_decay
+
+ return layers.reshape(output_boxes, (
+ -1, 4)).numpy(), layers.reshape(outputs, (-1, )).numpy()
+
+ def refine_target_box(self,
+ sample_pos,
+ sample_scale,
+ scale_ind,
+ update_scale=True):
+ # Initial box for refinement
+ init_box = self.get_iounet_box(self.pos, self.target_sz, sample_pos,
+ sample_scale)
+
+ # Extract features from the relevant scale
+ iou_features = self.get_iou_features()
+ iou_features = TensorList(
+ [x[scale_ind:scale_ind + 1, ...] for x in iou_features])
+
+ init_boxes = np.reshape(init_box, (1, 4)).copy()
+
+ rand_fn = lambda a, b: np.random.rand(a, b).astype('float32')
+
+ if self.params.num_init_random_boxes > 0:
+ # Get random initial boxes
+ square_box_sz = np.sqrt(init_box[2:].prod())
+ rand_factor = square_box_sz * np.concatenate([
+ self.params.box_jitter_pos * np.ones(2),
+ self.params.box_jitter_sz * np.ones(2)
+ ])
+ minimal_edge_size = init_box[2:].min() / 3
+ rand_bb = (rand_fn(self.params.num_init_random_boxes, 4) - 0.5
+ ) * rand_factor
+ new_sz = np.clip(init_box[2:] + rand_bb[:, 2:], minimal_edge_size,
+ 1e10)
+ new_center = (init_box[:2] + init_box[2:] / 2) + rand_bb[:, :2]
+ init_boxes = np.concatenate([new_center - new_sz / 2, new_sz], 1)
+ init_boxes = np.concatenate(
+ [np.reshape(init_box, (1, 4)), init_boxes])
+
+ # Refine boxes by maximizing iou
+ output_boxes, output_iou = self.optimize_boxes(iou_features, init_boxes)
+
+ # Remove weird boxes with extreme aspect ratios
+ output_boxes[:, 2:] = np.clip(output_boxes[:, 2:], 1, 1e10)
+ aspect_ratio = output_boxes[:, 2] / output_boxes[:, 3]
+ keep_ind = (aspect_ratio < self.params.maximal_aspect_ratio) * \
+ (aspect_ratio > 1 / self.params.maximal_aspect_ratio)
+ output_boxes = output_boxes[keep_ind, :]
+ output_iou = output_iou[keep_ind]
+
+ # If no box found
+ if output_boxes.shape[0] == 0:
+ return
+
+ # Take average of top k boxes
+ k = getattr(self.params, 'iounet_k', 5)
+ topk = min(k, output_boxes.shape[0])
+ inds = np.argsort(-output_iou)[:topk]
+ predicted_box = np.mean(output_boxes[inds, :], axis=0)
+ predicted_iou = np.mean(
+ np.reshape(output_iou, (-1, 1))[inds, :], axis=0)
+
+ # Update position
+ new_pos = predicted_box[:2] + predicted_box[2:] / 2 - (
+ self.iou_img_sample_sz - 1) / 2
+ new_pos = np.flip(new_pos, 0) * sample_scale + sample_pos
+ new_target_sz = np.flip(predicted_box[2:], 0) * sample_scale
+ new_scale = np.sqrt(
+ np.prod(new_target_sz) / np.prod(self.base_target_sz))
+
+ self.pos_iounet = new_pos.copy()
+
+ if getattr(self.params, 'use_iounet_pos_for_learning', True):
+ self.pos = new_pos.copy()
+
+ self.target_sz = new_target_sz
+
+ if update_scale:
+ self.target_scale = new_scale
+
+ def localize_advanced(self, scores):
+ """Does the advanced localization with hard negative detection and target not found."""
+
+ sz = scores.shape[-2:]
+
+ if self.output_window is not None and getattr(
+ self.params, 'perform_hn_without_windowing', False):
+ scores_orig = scores.copy()
+
+ scores_orig = np.concatenate([
+ scores_orig[..., (sz[0] + 1) // 2:, :],
+ scores_orig[..., :(sz[0] + 1) // 2, :]
+ ], -2)
+ scores_orig = np.concatenate([
+ scores_orig[..., :, (sz[1] + 1) // 2:],
+ scores_orig[..., :, :(sz[1] + 1) // 2]
+ ], -1)
+
+ scores *= self.output_window
+
+ # Shift scores back
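+        # (roll both spatial axes by half the map size so zero displacement is at the center)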
+ scores = np.concatenate([
+ scores[..., (sz[0] + 1) // 2:, :], scores[..., :(sz[0] + 1) // 2, :]
+ ], -2)
+ scores = np.concatenate([
+ scores[..., :, (sz[1] + 1) // 2:], scores[..., :, :(sz[1] + 1) // 2]
+ ], -1)
+
+ # Find maximum
+ max_score1, max_disp1 = dcf.max2d(scores)
+ scale_ind = np.argmax(max_score1, axis=0)[0]
+ max_score1 = max_score1[scale_ind]
+ max_disp1 = np.reshape(max_disp1[scale_ind].astype('float32'), (-1))
+
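+        # Convert the peak position on the score map into a translation vector
+        # in image coordinates at the current target scale.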
+ target_disp1 = max_disp1 - self.output_sz // 2
+ translation_vec1 = target_disp1 * (self.img_support_sz /
+ self.output_sz) * self.target_scale
+
+ if max_score1 < self.params.target_not_found_threshold:
+ return translation_vec1, scale_ind, scores, 'not_found'
+
+ if self.output_window is not None and getattr(
+ self.params, 'perform_hn_without_windowing', False):
+ scores = scores_orig
+
+ # Mask out target neighborhood
+ target_neigh_sz = self.params.target_neighborhood_scale * self.target_sz / self.target_scale
+ tneigh_top = int(max(round(max_disp1[0] - target_neigh_sz[0] / 2), 0))
+ tneigh_bottom = int(
+ min(round(max_disp1[0] + target_neigh_sz[0] / 2 + 1), sz[0]))
+ tneigh_left = int(max(round(max_disp1[1] - target_neigh_sz[1] / 2), 0))
+ tneigh_right = int(
+ min(round(max_disp1[1] + target_neigh_sz[1] / 2 + 1), sz[1]))
+ scores_masked = scores[scale_ind:scale_ind + 1, ...].copy()
+ scores_masked[..., tneigh_top:tneigh_bottom, tneigh_left:
+ tneigh_right] = 0
+
+ # Find new maximum
+ max_score2, max_disp2 = dcf.max2d(scores_masked)
+ max_disp2 = np.reshape(max_disp2.astype('float32'), (-1))
+ target_disp2 = max_disp2 - self.output_sz // 2
+ translation_vec2 = target_disp2 * (self.img_support_sz /
+ self.output_sz) * self.target_scale
+
+ # Handle the different cases
+ if max_score2 > self.params.distractor_threshold * max_score1:
+ disp_norm1 = np.sqrt(np.sum(target_disp1**2))
+ disp_norm2 = np.sqrt(np.sum(target_disp2**2))
+ disp_threshold = self.params.dispalcement_scale * math.sqrt(
+ sz[0] * sz[1]) / 2
+
+ if disp_norm2 > disp_threshold and disp_norm1 < disp_threshold:
+ return translation_vec1, scale_ind, scores, 'hard_negative'
+ if disp_norm2 < disp_threshold and disp_norm1 > disp_threshold:
+ return translation_vec2, scale_ind, scores, 'hard_negative'
+ if disp_norm2 > disp_threshold and disp_norm1 > disp_threshold:
+ return translation_vec1, scale_ind, scores, 'uncertain'
+
+ # If also the distractor is close, return with highest score
+ return translation_vec1, scale_ind, scores, 'uncertain'
+
+ if max_score2 > self.params.hard_negative_threshold * max_score1 and max_score2 > self.params.target_not_found_threshold:
+ return translation_vec1, scale_ind, scores, 'hard_negative'
+
+ return translation_vec1, scale_ind, scores, None
diff --git a/PaddleCV/tracking/pytracking/tracker/atom/optim.py b/PaddleCV/tracking/pytracking/tracker/atom/optim.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c25c43d6123187baf9ffbffef5b75f2e016a4dc
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/tracker/atom/optim.py
@@ -0,0 +1,243 @@
+import numpy as np
+from paddle.fluid import layers
+from paddle import fluid
+
+from pytracking.libs import optimization, TensorList, operation
+from pytracking.libs.paddle_utils import PTensor, broadcast_op, n2p, static_identity
+import math
+
+
+def stack_input(e):
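+    """Stack a list of arrays into a single batch array, skipping None entries;
+    a 1-D result is expanded to shape (N, 1)."""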
+ if isinstance(e, list):
+ e_exist = []
+ for x in e:
+ if x is not None:
+ e_exist.append(x)
+ e = np.stack(e_exist)
+ else:
+ assert isinstance(e, np.ndarray)
+ if len(e.shape) == 1:
+ e = np.expand_dims(e, 1)
+ return e
+
+
+class FactorizedConvProblem(optimization.L2Problem):
+ def __init__(self,
+ training_samples: TensorList,
+ y: TensorList,
+ filter_reg: TensorList,
+ projection_reg,
+ params,
+ sample_weights: TensorList,
+ projection_activation,
+ response_activation):
+ self.training_samples = training_samples
+ self.y = y
+ self.filter_reg = filter_reg
+ self.sample_weights = sample_weights
+ self.params = params
+ self.projection_reg = projection_reg
+ self.projection_activation = projection_activation
+ self.response_activation = response_activation
+
+ self.diag_M = self.filter_reg.concat(projection_reg)
+
+ self.inputs_dict = {}
+ # stack tensors
+ self.training_samples_stack = None
+ self.y_stack = None
+ self.sample_weights_stack = None
+
+ def get_inputs(self, scope=''):
+ if scope not in self.inputs_dict:
+ training_samples_p = TensorList([
+ fluid.layers.data(
+ '{}training_samples_{}'.format(scope, idx),
+ shape=[None] + list(v[0].shape),
+ stop_gradient=False,
+ append_batch_size=False)
+ for idx, v in enumerate(self.training_samples)
+ ])
+ y_p = TensorList([
+ fluid.layers.data(
+ '{}y_{}'.format(scope, idx),
+ shape=[None] + list(v[0].shape),
+ stop_gradient=False,
+ append_batch_size=False) for idx, v in enumerate(self.y)
+ ])
+ sample_weights_p = TensorList([
+ fluid.layers.data(
+ '{}sample_weights_{}'.format(scope, idx),
+ shape=[None, 1],
+ stop_gradient=False,
+ append_batch_size=False)
+ for idx, v in enumerate(self.sample_weights)
+ ])
+ self.inputs_dict[scope] = (training_samples_p, y_p,
+ sample_weights_p)
+
+ return self.inputs_dict[scope]
+
+ def get_feed_dict(self, scope=''):
+ if self.training_samples_stack is None or self.y_stack is None or self.sample_weights_stack is None:
+ self.training_samples_stack = self.training_samples.apply(
+ stack_input)
+ self.y_stack = self.y.apply(stack_input)
+ self.sample_weights_stack = self.sample_weights.apply(stack_input)
+ feed_dict = {}
+ for idx, v in enumerate(self.training_samples_stack):
+ feed_dict['{}training_samples_{}'.format(scope, idx)] = v
+ for idx, v in enumerate(self.y_stack):
+ feed_dict['{}y_{}'.format(scope, idx)] = v
+ for idx, v in enumerate(self.sample_weights_stack):
+ feed_dict['{}sample_weights_{}'.format(scope, idx)] = v
+ return feed_dict
+
+ def __call__(self, x: TensorList, scope=''):
+ """
+ Compute residuals
+ :param x: [filters, projection_matrices]
+ :return: [data_terms, filter_regularizations, proj_mat_regularizations]
+ """
+ training_samples, y, samples_weights = self.get_inputs(scope)
+
+ filter = x[:len(x) // 2] # w2 in paper
+ P = x[len(x) // 2:] # w1 in paper
+
+ # Do first convolution
+ compressed_samples = operation.conv1x1(
+ training_samples, P).apply(self.projection_activation)
+
+ # Do second convolution
+ residuals = operation.conv2d(
+ compressed_samples, filter,
+ mode='same').apply(self.response_activation)
+
+ # Compute data residuals
+ residuals = residuals - y
+
+ residuals = residuals * samples_weights.sqrt()
+
+ # Add regularization for projection matrix
+ # TODO: remove static_identity
+ # for now, this is needed. Otherwise the gradient is None
+ residuals.extend(
+ filter.apply(static_identity) * self.filter_reg.apply(math.sqrt))
+
+ # Add regularization for projection matrix
+ residuals.extend(
+ P.apply(static_identity) * self.projection_reg.apply(math.sqrt))
+
+ return residuals
+
+ def ip_input(self, a: TensorList, b: TensorList):
+ # return a.reshape(-1) @ b.reshape(-1)
+ num = len(a) // 2 # Number of filters
+ a_filter = a[:num]
+ b_filter = b[:num]
+ a_P = a[num:]
+ b_P = b[num:]
+
+ # Filter inner product
+ ip_out = a_filter.reshape(-1) @b_filter.reshape(-1)
+ # ip_out = operation.conv2d(a_filter, b_filter).view(-1)
+
+ # Add projection matrix part
+ ip_out += a_P.reshape(-1) @b_P.reshape(-1)
+ # ip_out += operation.conv2d(a_P.view(1, -1, 1, 1), b_P.view(1, -1, 1, 1)).view(-1)
+
+ # Have independent inner products for each filter
+ return ip_out.concat(ip_out.clone())
+
+ def M1(self, x: TensorList):
+ return x / self.diag_M
+
+
+class ConvProblem(optimization.L2Problem):
+ def __init__(self,
+ training_samples: TensorList,
+ y: TensorList,
+ filter_reg: TensorList,
+ sample_weights: TensorList,
+ response_activation):
+ self.training_samples = training_samples
+ self.y = y
+ self.filter_reg = filter_reg
+ self.sample_weights = sample_weights
+ self.response_activation = response_activation
+
+ self.inputs_dict = {}
+ # stack tensors
+ self.training_samples_stack = None
+ self.y_stack = None
+ self.sample_weights_stack = None
+
+ def get_feed_dict(self, scope=''):
+ if self.training_samples_stack is None or self.y_stack is None or self.sample_weights_stack is None:
+ self.training_samples_stack = self.training_samples.apply(
+ stack_input)
+ self.y_stack = self.y.apply(stack_input)
+ self.sample_weights_stack = self.sample_weights.apply(stack_input)
+ feed_dict = {}
+ for idx, v in enumerate(self.training_samples_stack):
+ feed_dict['{}training_samples_{}'.format(scope, idx)] = v
+ for idx, v in enumerate(self.y_stack):
+ feed_dict['{}y_{}'.format(scope, idx)] = v
+ for idx, v in enumerate(self.sample_weights_stack):
+ feed_dict['{}sample_weights_{}'.format(scope, idx)] = v
+ return feed_dict
+
+ def get_inputs(self, scope=''):
+ if scope not in self.inputs_dict:
+ training_samples_p = TensorList([
+ fluid.layers.data(
+ '{}training_samples_{}'.format(scope, idx),
+ shape=[None] + list(v[0].shape),
+ stop_gradient=False,
+ append_batch_size=False)
+ for idx, v in enumerate(self.training_samples)
+ ])
+ y_p = TensorList([
+ fluid.layers.data(
+ '{}y_{}'.format(scope, idx),
+ shape=[None] + list(v[0].shape),
+ stop_gradient=False,
+ append_batch_size=False) for idx, v in enumerate(self.y)
+ ])
+ sample_weights_p = TensorList([
+ fluid.layers.data(
+ '{}sample_weights_{}'.format(scope, idx),
+ shape=[None] + list(v[0].shape),
+ stop_gradient=False,
+ append_batch_size=False)
+ for idx, v in enumerate(self.sample_weights)
+ ])
+ self.inputs_dict[scope] = (training_samples_p, y_p,
+ sample_weights_p)
+
+ return self.inputs_dict[scope]
+
+ def __call__(self, x: TensorList, scope=''):
+ """
+ Compute residuals
+ :param x: [filters]
+ :return: [data_terms, filter_regularizations]
+ """
+ training_samples, y, samples_weights = self.get_inputs(scope)
+ # Do convolution and compute residuals
+ residuals = operation.conv2d(
+ training_samples, x, mode='same').apply(self.response_activation)
+ residuals = residuals - y
+
+ residuals = residuals * samples_weights.sqrt()
+
+ # Add regularization for projection matrix
+ residuals.extend(
+ x.apply(static_identity) * self.filter_reg.apply(math.sqrt))
+
+ return residuals
+
+ def ip_input(self, a: TensorList, b: TensorList):
+ return a.reshape(-1) @b.reshape(-1)
+ # return (a * b).sum()
+ # return operation.conv2d(a, b).view(-1)
diff --git a/PaddleCV/tracking/pytracking/tracker/base/__init__.py b/PaddleCV/tracking/pytracking/tracker/base/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/PaddleCV/tracking/pytracking/tracker/base/basetracker.py b/PaddleCV/tracking/pytracking/tracker/base/basetracker.py
new file mode 100644
index 0000000000000000000000000000000000000000..77e473d2d77239f8471a43ca015fb717dfa0fdfe
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/tracker/base/basetracker.py
@@ -0,0 +1,286 @@
+import matplotlib
+
+matplotlib.use('TkAgg')
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+import cv2 as cv
+import time
+import os
+
+
+class BaseTracker:
+ """Base class for all trackers."""
+
+ def __init__(self, params):
+ self.params = params
+
+ def initialize(self, image, state, class_info=None):
+ """Overload this function in your tracker. This should initialize the model."""
+ raise NotImplementedError
+
+ def track(self, image):
+ """Overload this function in your tracker. This should track in the frame and update the model."""
+ raise NotImplementedError
+
+ def track_sequence(self, sequence):
+ """Run tracker on a sequence."""
+
+ # Initialize
+ image = self._read_image(sequence.frames[0])
+
+ times = []
+ start_time = time.time()
+ self.initialize(image, sequence.init_state)
+ init_time = getattr(self, 'time', time.time() - start_time)
+ times.append(init_time)
+
+ if self.params.visualization:
+ self.init_visualization()
+ self.visualize(image, sequence.init_state)
+
+ # Track
+ tracked_bb = [sequence.init_state]
+ for frame in sequence.frames[1:]:
+ image = self._read_image(frame)
+
+ start_time = time.time()
+ state = self.track(image)
+ times.append(time.time() - start_time)
+
+ tracked_bb.append(state)
+
+ if self.params.visualization:
+ self.visualize(image, state)
+
+ return tracked_bb, times
+
+ def track_videofile(self, videofilepath, optional_box=None):
+ """Run track with a video file input."""
+
+        assert os.path.isfile(videofilepath), \
+            "Invalid param {}, videofilepath must be a valid videofile".format(
+                videofilepath)
+
+ if hasattr(self, 'initialize_features'):
+ self.initialize_features()
+
+ cap = cv.VideoCapture(videofilepath)
+ display_name = 'Display: ' + self.params.tracker_name
+ cv.namedWindow(display_name, cv.WINDOW_NORMAL | cv.WINDOW_KEEPRATIO)
+ cv.resizeWindow(display_name, 960, 720)
+        success, frame = cap.read()
+        if not success:
+            print("Read frame from {} failed.".format(videofilepath))
+            exit(-1)
+        cv.imshow(display_name, frame)
+        if optional_box is not None:
+            assert isinstance(optional_box, (list, tuple))
+            assert len(optional_box) == 4, "valid box format is [x, y, w, h]"
+ self.initialize(frame, optional_box)
+ else:
+ while True:
+ # cv.waitKey()
+ frame_disp = frame.copy()
+
+ cv.putText(frame_disp, 'Select target ROI and press ENTER',
+ (20, 30), cv.FONT_HERSHEY_COMPLEX_SMALL, 1.5,
+ (0, 0, 0), 1)
+
+ x, y, w, h = cv.selectROI(
+ display_name, frame_disp, fromCenter=False)
+ init_state = [x, y, w, h]
+ self.initialize(frame, init_state)
+ break
+
+ while True:
+ ret, frame = cap.read()
+
+            if frame is None:
+                break
+
+ frame_disp = frame.copy()
+
+ # Draw box
+ state = self.track(frame)
+ state = [int(s) for s in state]
+ cv.rectangle(frame_disp, (state[0], state[1]),
+ (state[2] + state[0], state[3] + state[1]),
+ (0, 255, 0), 5)
+
+ font_color = (0, 0, 0)
+ cv.putText(frame_disp, 'Tracking!', (20, 30),
+ cv.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
+ cv.putText(frame_disp, 'Press r to reset', (20, 55),
+ cv.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
+ cv.putText(frame_disp, 'Press q to quit', (20, 80),
+ cv.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
+
+ # Display the resulting frame
+ cv.imshow(display_name, frame_disp)
+ key = cv.waitKey(1)
+ if key == ord('q'):
+ break
+ elif key == ord('r'):
+ ret, frame = cap.read()
+ frame_disp = frame.copy()
+
+ cv.putText(frame_disp, 'Select target ROI and press ENTER',
+ (20, 30), cv.FONT_HERSHEY_COMPLEX_SMALL, 1.5,
+ (0, 0, 0), 1)
+
+ cv.imshow(display_name, frame_disp)
+ x, y, w, h = cv.selectROI(
+ display_name, frame_disp, fromCenter=False)
+ init_state = [x, y, w, h]
+ self.initialize(frame, init_state)
+
+ # When everything done, release the capture
+ cap.release()
+ cv.destroyAllWindows()
+
+ def track_webcam(self):
+ """Run tracker with webcam."""
+
+ class UIControl:
+ def __init__(self):
+ self.mode = 'init' # init, select, track
+ self.target_tl = (-1, -1)
+ self.target_br = (-1, -1)
+ self.mode_switch = False
+
+ def mouse_callback(self, event, x, y, flags, param):
+ if event == cv.EVENT_LBUTTONDOWN and self.mode == 'init':
+ self.target_tl = (x, y)
+ self.target_br = (x, y)
+ self.mode = 'select'
+ self.mode_switch = True
+ elif event == cv.EVENT_MOUSEMOVE and self.mode == 'select':
+ self.target_br = (x, y)
+ elif event == cv.EVENT_LBUTTONDOWN and self.mode == 'select':
+ self.target_br = (x, y)
+ self.mode = 'track'
+ self.mode_switch = True
+
+ def get_tl(self):
+ return self.target_tl if self.target_tl[0] < self.target_br[
+ 0] else self.target_br
+
+ def get_br(self):
+ return self.target_br if self.target_tl[0] < self.target_br[
+ 0] else self.target_tl
+
+ def get_bb(self):
+ tl = self.get_tl()
+ br = self.get_br()
+
+ bb = [tl[0], tl[1], br[0] - tl[0], br[1] - tl[1]]
+ return bb
+
+ ui_control = UIControl()
+ cap = cv.VideoCapture(0)
+ display_name = 'Display: ' + self.params.tracker_name
+ cv.namedWindow(display_name, cv.WINDOW_NORMAL | cv.WINDOW_KEEPRATIO)
+ cv.resizeWindow(display_name, 960, 720)
+ cv.setMouseCallback(display_name, ui_control.mouse_callback)
+
+ if hasattr(self, 'initialize_features'):
+ self.initialize_features()
+
+ while True:
+ # Capture frame-by-frame
+ ret, frame = cap.read()
+ frame_disp = frame.copy()
+
+ if ui_control.mode == 'track' and ui_control.mode_switch:
+ ui_control.mode_switch = False
+ init_state = ui_control.get_bb()
+ self.initialize(frame, init_state)
+
+ # Draw box
+ if ui_control.mode == 'select':
+ cv.rectangle(frame_disp,
+ ui_control.get_tl(),
+ ui_control.get_br(), (255, 0, 0), 2)
+ elif ui_control.mode == 'track':
+ state = self.track(frame)
+ state = [int(s) for s in state]
+ cv.rectangle(frame_disp, (state[0], state[1]),
+ (state[2] + state[0], state[3] + state[1]),
+ (0, 255, 0), 5)
+
+ # Put text
+ font_color = (0, 0, 0)
+ if ui_control.mode == 'init' or ui_control.mode == 'select':
+ cv.putText(frame_disp, 'Select target', (20, 30),
+ cv.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
+ cv.putText(frame_disp, 'Press q to quit', (20, 55),
+ cv.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
+ elif ui_control.mode == 'track':
+ cv.putText(frame_disp, 'Tracking!', (20, 30),
+ cv.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
+ cv.putText(frame_disp, 'Press r to reset', (20, 55),
+ cv.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
+ cv.putText(frame_disp, 'Press q to quit', (20, 80),
+ cv.FONT_HERSHEY_COMPLEX_SMALL, 1, font_color, 1)
+ # Display the resulting frame
+ cv.imshow(display_name, frame_disp)
+ key = cv.waitKey(1)
+ if key == ord('q'):
+ break
+ elif key == ord('r'):
+ ui_control.mode = 'init'
+
+ # When everything done, release the capture
+ cap.release()
+ cv.destroyAllWindows()
+
+ def reset_tracker(self):
+ pass
+
+ def press(self, event):
+ if event.key == 'p':
+ self.pause_mode = not self.pause_mode
+ print("Switching pause mode!")
+ elif event.key == 'r':
+ self.reset_tracker()
+ print("Resetting target pos to gt!")
+
+ def init_visualization(self):
+ # plt.ion()
+ self.pause_mode = False
+ self.fig, self.ax = plt.subplots(1)
+ self.fig.canvas.mpl_connect('key_press_event', self.press)
+ plt.tight_layout()
+
+ def visualize(self, image, state):
+ self.ax.cla()
+ self.ax.imshow(image)
+ rect = patches.Rectangle(
+ (state[0], state[1]),
+ state[2],
+ state[3],
+ linewidth=1,
+ edgecolor='r',
+ facecolor='none')
+ self.ax.add_patch(rect)
+
+        if hasattr(self, 'gt_state') and False:  # ground-truth overlay disabled
+ gt_state = self.gt_state
+ rect = patches.Rectangle(
+ (gt_state[0], gt_state[1]),
+ gt_state[2],
+ gt_state[3],
+ linewidth=1,
+ edgecolor='g',
+ facecolor='none')
+ self.ax.add_patch(rect)
+ self.ax.set_axis_off()
+ self.ax.axis('equal')
+ plt.draw()
+ plt.pause(0.001)
+
+ if self.pause_mode:
+ plt.waitforbuttonpress()
+
+ def _read_image(self, image_file: str):
+ return cv.cvtColor(cv.imread(image_file), cv.COLOR_BGR2RGB)
diff --git a/PaddleCV/tracking/pytracking/tracker/siamfc/__init__.py b/PaddleCV/tracking/pytracking/tracker/siamfc/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..91cf4214c152ea9863259998d79ab9b313e64338
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/tracker/siamfc/__init__.py
@@ -0,0 +1,5 @@
+from .siamfc import SiamFC
+
+
+def get_tracker_class():
+ return SiamFC
diff --git a/PaddleCV/tracking/pytracking/tracker/siamfc/eval_siamfc_otb.py b/PaddleCV/tracking/pytracking/tracker/siamfc/eval_siamfc_otb.py
new file mode 100644
index 0000000000000000000000000000000000000000..d71b1f10009be2bfda952e2c1832965b9c76cd2a
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/tracker/siamfc/eval_siamfc_otb.py
@@ -0,0 +1,170 @@
+import os
+import numpy as np
+
+from PIL import Image
+
+import os.path as osp
+import sys
+CURRENT_DIR = osp.dirname(__file__)
+sys.path.append(osp.join(CURRENT_DIR, '..', '..', '..'))
+
+from pytracking.pysot_toolkit.utils import success_overlap, success_error
+import json
+from pytracking.tracker.siamfc.siamfc import SiamFC
+
+from tqdm import tqdm
+
+from pytracking.parameter.siamfc.default import parameters
+
+
+class ValidOTB(SiamFC):
+ def __init__(self, dataset_root, dataset_name, params):
+ super(ValidOTB, self).__init__(params)
+ """
+ dataset_root: the root directory of dataset
+ dataset_name: the name of OTB dataste, [CVPR2013, OTB50, OTB100]
+ """
+ self.params = self.params
+ self.root_path = dataset_root
+ if not os.path.exists(self.root_path):
+ raise Exception("'{}' does not exists".format(self.root_path))
+
+ dataset_list = ['CVPR13', 'OTB2013', 'OTB100', 'OTB50']
+ if dataset_name not in dataset_list:
+ raise Exception("ValidOTB's dataset_name can only be one of {}".
+ format(dataset_list))
+ if dataset_name == 'OTB2013':
+ dataset_name = 'CVPR13'
+ self.dataset_name = dataset_name
+ self.otb2013_json = os.path.join(self.root_path, dataset_name + '.json')
+
+ self.meta_data = json.load(open(self.otb2013_json, 'rb'))
+ self.video_name = list(self.meta_data.keys())
+
+ def inference(self, epoch):
+
+ gtbb = []
+ prebb = []
+ """ add save dir """
+ save_dir = "./eval_otb13/epoch_" + str(epoch)
+ if not os.path.exists(save_dir):
+ os.makedirs(save_dir)
+
+ # load videos
+ pbar = tqdm(
+ self.meta_data.keys(),
+ desc='loading ' + self.dataset_name,
+ ncols=100)
+ for idx, vid in enumerate(pbar):
+ pbar.set_postfix_str(vid)
+
+ gt_boxs = self.meta_data[vid]['gt_rect']
+ start_frame, end_frame = 0, len(gt_boxs)
+ img_list = self.meta_data[vid]['img_names']
+ assert len(img_list) == len(gt_boxs)
+
+ gt_box_list = []
+ pre_box_list = []
+ for i in range(start_frame, end_frame):
+ img = Image.open(os.path.join(self.root_path, img_list[i]))
+ if len(img.size) < 3 or img.size[-1] == 1:
+ img = img.convert('RGB')
+
+ gt_box = gt_boxs[i - start_frame]
+
+ if i == start_frame:
+ self.initialize(image=img, state=gt_box)
+ pre_box_list.append(gt_box)
+ gt_box_list.append(gt_box)
+ continue
+ else:
+ pre_box = self.track(img)
+
+ pre_box_list.append(list(pre_box))
+ gt_box_list.append(gt_box)
+
+ gtbb += gt_box_list
+ prebb += pre_box_list
+ """ add save_dir"""
+ vid_save_dir = os.path.join(save_dir, vid + '.txt')
+ with open(vid_save_dir, 'w') as f:
+ outputs = []
+ for res in pre_box_list:
+ outputs.append('{},{},{},{}'.format(res[0], res[1], res[2],
+ res[3]))
+ f.write('\n'.join(outputs))
+
+ auc = success_overlap(np.array(gtbb), np.array(prebb), len(gtbb))
+
+ thresholds = np.arange(0, 51, 1)
+ gt_center = self.convert_bb_to_center(np.array(gtbb))
+ tracker_center = self.convert_bb_to_center(np.array(prebb))
+ precision = success_error(
+ np.array(gt_center),
+ np.array(tracker_center), thresholds, len(gtbb))
+ print("####AUC:{}, Precision:{}".format(
+ np.mean(auc), np.mean(precision)))
+
+ return np.mean(auc), np.mean(precision)
+
+ def convert_bb_to_center(self, bboxes):
+ return np.array([(bboxes[:, 0] + (bboxes[:, 2] - 1) / 2),
+ (bboxes[:, 1] + (bboxes[:, 3] - 1) / 2)]).T
+
+
+import argparse
+parser = argparse.ArgumentParser()
+parser.add_argument(
+ '--checkpoint',
+ type=str,
+ default="./checkpoint/",
+ help="the path of saved siamfc params file")
+parser.add_argument(
+ '--dataset_dir',
+ type=str,
+ default="/paddle/Datasets/OTB100",
+ help="the path of OTB dataset")
+parser.add_argument(
+ '--dataset_name',
+ type=str,
+ default="CVPR13",
+ help="can only be one of [CVPR13, OTB2013, OTB50, OTB100]")
+
+parser.add_argument(
+ '--start_epoch',
+ type=int,
+ default=1,
+ help="evaluate from start_epoch epoch, greater than 1")
+parser.add_argument(
+ '--end_epoch',
+ type=int,
+ default=50,
+ help="evaluate ends at end_epoch epoch, smaller than 50 ")
+
+args = parser.parse_args()
+
+if __name__ == '__main__':
+
+ params = parameters()
+ params.net_path = args.checkpoint
+ start_epoch = args.start_epoch
+ end_epoch = args.end_epoch
+
+ assert start_epoch >= 1 and end_epoch <= 50 and start_epoch < end_epoch
+
+ best_auc, best_epoch = 0, start_epoch
+
+ for i in range(start_epoch, end_epoch, 1):
+ params.net_path = os.path.join(args.checkpoint, "SiamNet_ep%004d" % i)
+ valid = ValidOTB(
+ dataset_root=args.dataset_dir,
+ dataset_name=args.dataset_name,
+ params=params)
+
+ auc, precision = valid.inference(epoch=i)
+
+ if auc > best_auc:
+ best_auc = auc
+ best_epoch = i
+ print("####Best AUC is {}, corresponding epoch is {}".format(
+ best_auc, best_epoch))
diff --git a/PaddleCV/tracking/pytracking/tracker/siamfc/eval_siamfc_vot.py b/PaddleCV/tracking/pytracking/tracker/siamfc/eval_siamfc_vot.py
new file mode 100644
index 0000000000000000000000000000000000000000..e12448cf17121c80c6c72a7e86f0b2dbf50e7e1b
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/tracker/siamfc/eval_siamfc_vot.py
@@ -0,0 +1,258 @@
+import os
+import numpy as np
+
+from PIL import Image
+
+import os.path as osp
+import sys
+CURRENT_DIR = osp.dirname(__file__)
+sys.path.append(osp.join(CURRENT_DIR, '..', '..', '..'))
+
+import json
+from pytracking.tracker.siamfc.siamfc import SiamFC
+
+from tqdm import tqdm
+
+from pytracking.parameter.siamfc.default import parameters
+
+
+class ValidVOT(SiamFC):
+ def __init__(self, dataset_root, dataset_name, params):
+ super(ValidVOT, self).__init__(params)
+ """
+ dataset_root: the root directory of dataset
+ dataset_name: the name of VOT dataste, [VOt2015, VOT2018, ...]
+ """
+ self.params = self.params
+ self.root_path = dataset_root
+ if not os.path.exists(self.root_path):
+ raise Exception("'{}' does not exists".format(self.root_path))
+
+ dataset_list = ['VOT2015', 'VOT2018']
+ if dataset_name not in dataset_list:
+ raise Exception("ValidVOT's dataset_name can only be one of {}".
+ format(dataset_list))
+
+ self.dataset_name = dataset_name
+ self.vot2013_json = os.path.join(self.root_path, dataset_name + '.json')
+ # self.otb2013_json = "/paddle/Datasets/OTB100/CVPR13.json"
+
+ self.meta_data = json.load(open(self.vot2013_json, 'rb'))
+ self.video_name = list(self.meta_data.keys())
+
+ def inference_reinit(self, epoch, start_frame=0):
+
+ # video-wised
+ vid_num = len(self.video_name)
+ vid_ious = np.zeros(vid_num)
+ vid_length = np.zeros(vid_num)
+ fail_num = np.zeros(vid_num)
+
+ burn_in_period = 5
+ pbar = tqdm(
+ self.meta_data.keys(),
+ desc='loading ' + self.dataset_name,
+ ncols=100)
+
+ for idx, vid in enumerate(pbar):
+ pbar.set_postfix_str(vid)
+
+ gt_boxs = self.meta_data[vid]['gt_rect']
+ img_list = self.meta_data[vid]['img_names']
+ imgs_num = len(img_list)
+
+ gt_box_list = []
+ pre_box_list = []
+
+ valid_frames_num = imgs_num - start_frame
+ step = start_frame
+ reinit = True
+ re_init_frame = step
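+            # Reset protocol: after a tracking failure (IoU == 0) the tracker skips
+            # ahead `burn_in_period` frames before re-initializing, and frames within
+            # 10 of the re-init point are excluded from the accuracy as burn-in.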
+ while step < imgs_num:
+ img = Image.open(os.path.join(self.root_path, img_list[step]))
+
+ gt_box = list(map(float, self.region_to_bbox(gt_boxs[step])))
+
+ if reinit:
+                    # (re-)initialize the tracker on this frame
+ self.initialize(img, gt_box)
+ reinit = False
+ # print("reinit, vid: {}, step: {}, failnum: {}".format(vid, step, fail_num[idx]))
+ continue
+
+ pre_box = self.track(img)
+ if step - re_init_frame < 10:
+ # burn in period
+ step += 1
+ valid_frames_num -= 1
+ continue
+
+ pre_box_list.append(list(pre_box))
+ gt_box_list.append(gt_box)
+
+ iou = self._compute_iou(pre_box, gt_box)
+ vid_ious[idx] += iou
+
+ if iou == 0.:
+ reinit = True
+
+ fail_num[idx] += 1
+                    # skip ahead so the tracker is re-initialized a few
+                    # frames after the failure
+ step += burn_in_period
+ re_init_frame = step
+ valid_frames_num -= burn_in_period
+ step += 1
+
+ vid_length[idx] = valid_frames_num
+ #print("idx: {}, vid: {}, failure: {}, miou: {}\n".format(idx, vid, fail_num[idx],
+ # vid_ious[idx]/valid_frames_num))
+
+ acc = np.sum(vid_ious) / np.sum(vid_length)
+ print("##########Evaluation##########")
+ print("##acc = {}".format(acc))
+ print("##failure = {}".format(np.sum(fail_num)))
+
+ return acc, np.sum(fail_num)
+
+ def _compute_iou(self, box1, box2):
+ """
+ computing IoU
+ print("acc shape", acc.shape, "vid_length shape: ", vid_length.shape)
+ print("acc shape", acc.shape, "vid_length shape: ", vid_length.shape)
+ :param rec1: (x0, y0, w, h), which reflects
+ (top, left, bottom, right)
+ :param rec2: (x0, y0, w, h)
+ :return: scala value of IoU
+ """
+ rec1 = box1
+ rec2 = box2
+ # computing area of each rectangles
+ S_rec1 = (rec1[2] + 1) * (rec1[3] + 1)
+ S_rec2 = (rec2[2] + 1) * (rec2[3] + 1)
+
+ # computing the sum_area
+ sum_area = S_rec1 + S_rec2
+
+        # find each edge of the intersection rectangle
+        left_line = max(rec1[1], rec2[1])
+        right_line = min(rec1[3] + rec1[1], rec2[3] + rec2[1])
+        top_line = max(rec1[0], rec2[0])
+        bottom_line = min(rec1[2] + rec1[0], rec2[2] + rec2[0])
+
+        # check whether the two boxes intersect
+ if left_line >= right_line or top_line >= bottom_line:
+ iou = 0.
+ else:
+ intersect = (right_line - left_line + 1) * (
+ bottom_line - top_line + 1)
+ iou = (intersect / (sum_area - intersect)) * 1.0
+ assert iou >= 0
+ assert iou <= 1.01
+ return iou
+
+ def region_to_bbox(self, region, center=False):
+
+ n = len(region)
+ region = np.array(region)
+ assert n == 4 or n == 8, (
+ 'GT region format is invalid, should have 4 or 8 entries.')
+
+        # the ground-truth boxes are assumed to be 1-indexed; convert them to 0-indexing
+ def _rect(region, center):
+
+ if center:
+ x = region[0]
+ y = region[1]
+ w = region[2]
+ h = region[3]
+ cx = x + w / 2
+ cy = y + h / 2
+ return cx, cy, w, h
+ else:
+ region[0] -= 1
+ region[1] -= 1
+ return region
+
+ def _poly(region, center):
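+            # Approximate the rotated rectangle (8-value region) by an axis-aligned
+            # box: the bounding box is rescaled by s = sqrt(A1 / A2) so its area
+            # roughly matches the rotated region.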
+ cx = np.mean(region[::2])
+ cy = np.mean(region[1::2])
+ x1 = np.min(region[::2])
+ x2 = np.max(region[::2])
+ y1 = np.min(region[1::2])
+ y2 = np.max(region[1::2])
+ A1 = np.linalg.norm(region[0:2] - region[2:4]) * np.linalg.norm(
+ region[2:4] - region[4:6])
+ A2 = (x2 - x1) * (y2 - y1)
+ s = np.sqrt(A1 / A2)
+ w = s * (x2 - x1) + 1
+ h = s * (y2 - y1) + 1
+
+ if center:
+ return cx, cy, w, h
+ else:
+ return cx - w / 2, cy - h / 2, w, h
+
+ if n == 4:
+ return _rect(region, center)
+ else:
+ return _poly(region, center)
+
+
+import argparse
+parser = argparse.ArgumentParser()
+parser.add_argument(
+ '--checkpoint',
+ type=str,
+ default="./checkpoint/",
+ help="the path of saved siamfc params file")
+parser.add_argument(
+ '--dataset_dir',
+ type=str,
+ default="/paddle/Datasets/VOT2015",
+ help="the path of VOT dataset")
+parser.add_argument(
+ '--dataset_name',
+ type=str,
+ default="VOT2015",
+ help="can only be one of [VOT2015, VOT2018]")
+
+parser.add_argument(
+ '--start_epoch',
+ type=int,
+ default=1,
+ help="evaluate from start_epoch epoch, greater than 1")
+parser.add_argument(
+ '--end_epoch',
+ type=int,
+ default=50,
+ help="evaluate ends at end_epoch epoch, smaller than 50 ")
+
+args = parser.parse_args()
+
+if __name__ == '__main__':
+
+ params = parameters()
+ params.net_path = args.checkpoint
+ start_epoch = args.start_epoch
+ end_epoch = args.end_epoch
+
+ assert start_epoch >= 1 and end_epoch <= 50 and start_epoch < end_epoch
+
+ best_acc, best_failure, best_epoch = 0, 100, start_epoch
+
+ for i in range(start_epoch, end_epoch, 2):
+ params.net_path = os.path.join(args.checkpoint, "SiamNet_ep%004d" % i)
+ valid = ValidVOT(
+ dataset_root=args.dataset_dir,
+ dataset_name=args.dataset_name,
+ params=params)
+
+ acc, failure = valid.inference_reinit(epoch=i)
+ print("####Epoch: {}, ACC: {}, Failure: {}".format(i, acc, failure))
+    if acc > best_acc and failure <= 84:
+        best_acc = acc
+        best_failure = failure
+        best_epoch = i
+    print("####Best ACC: {}, Failure: {}, corresponding epoch: {}".format(
+        best_acc, best_failure, best_epoch))
diff --git a/PaddleCV/tracking/pytracking/tracker/siamfc/siamfc.py b/PaddleCV/tracking/pytracking/tracker/siamfc/siamfc.py
new file mode 100644
index 0000000000000000000000000000000000000000..704fafe389e1607f843af20fae2590819b4c21d5
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/tracker/siamfc/siamfc.py
@@ -0,0 +1,208 @@
+import time
+import numpy as np
+
+import paddle.fluid as fluid
+from paddle.fluid import dygraph
+
+from pytracking.tracker.base.basetracker import BaseTracker
+
+from ltr.models.siamese.siam import siamfc_alexnet
+
+import cv2
+# for debug
+from pytracking.parameter.siamfc.default import parameters
+
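+# A minimal usage sketch (illustrative only; assumes `params.net_path` points to a
+# trained SiamFC checkpoint):
+#
+#     params = parameters()
+#     tracker = SiamFC(params)
+#     tracker.initialize(first_frame, [x, y, w, h])  # initial target box in pixels
+#     box = tracker.track(next_frame)                # predicted [x, y, w, h]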
+
+class SiamFC(BaseTracker):
+ def __init__(self, params=parameters()):
+
+ self.params = params
+ self.model_initializer()
+
+ def initialize_features(self):
+ if not getattr(self, 'features_initialized', False):
+ self.params.features.initialize()
+ self.features_initialized = True
+
+ def model_initializer(self):
+ import os
+ net_path = self.params.net_path
+ if net_path is None:
+ net_path = self.params.features.features[0].net_path
+ if not os.path.exists(net_path):
+ raise Exception("not found {}".format(net_path))
+ with dygraph.guard():
+ self.model = siamfc_alexnet(backbone_is_test=True)
+ #state_dict, _ = fluid.load_dygraph(net_path)
+ weight_params, opt_params = fluid.load_dygraph(net_path)
+ state_dict = self.model.state_dict()
+ for k1, k2 in zip(state_dict.keys(), weight_params.keys()):
+ if list(state_dict[k1].shape) == list(weight_params[k2].shape):
+ state_dict[k1].set_value(weight_params[k2])
+ else:
+ raise Exception("ERROR, shape not match")
+ self.model.load_dict(state_dict)
+ self.model.eval()
+
+ def _cosine_window(self, size):
+ """
+ get the cosine window
+ """
+ cos_window = np.hanning(int(size[0]))[:, np.newaxis].dot(
+ np.hanning(int(size[1]))[np.newaxis, :])
+ cos_window = cos_window.astype(np.float32)
+ cos_window /= np.sum(cos_window)
+ return cos_window
+
+ def initialize(self, image, state, *args, **kwargs):
+ # state (x, y, w, h)
+ # Initialize some stuff
+ self.frame_num = 1
+ self.time = 0
+
+ # Get position and size
+ box = state
+ image = np.asarray(image)
+ # convert box to 0-indexed and center based [y, x, h, w]
+ box = np.array(
+ [
+ box[1] - 1 + (box[3] - 1) / 2, box[0] - 1 + (box[2] - 1) / 2,
+ box[3], box[2]
+ ],
+ dtype=np.float32)
+ self.center, self.target_sz = box[:2], box[2:]
+
+ # create hanning window
+ self.upscale_sz = self.params.response_up * self.params.response_sz
+ self.hann_window = np.outer(
+ np.hanning(self.upscale_sz), np.hanning(self.upscale_sz))
+ self.hann_window /= self.hann_window.sum()
+
+ # search scale factors
+ self.scale_factors = self.params.scale_step**np.linspace(
+ -(self.params.scale_num // 2), self.params.scale_num // 2,
+ self.params.scale_num)
+
+ # exemplar and search sizes
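+        # z_sz is the side of a square exemplar region with a context margin of
+        # context * (w + h) added to the target size; x_sz enlarges it by
+        # instance_sz / exemplar_sz to cover the search area.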
+ context = self.params.context * np.sum(self.target_sz)
+ self.z_sz = np.sqrt(np.prod(self.target_sz + context))
+ self.x_sz = self.z_sz * \
+ self.params.instance_sz / self.params.exemplar_sz
+
+ # exemplar image
+ self.avg_color = np.mean(image, axis=(0, 1))
+ exemplar_image = self._crop_and_resize(
+ image,
+ self.center,
+ self.z_sz,
+ out_size=self.params.exemplar_sz,
+ pad_color=self.avg_color)
+ self.exemplar_img_1s = exemplar_image[np.newaxis, :, :, :]
+ self.exemplar_img = np.transpose(self.exemplar_img_1s,
+ [0, 3, 1, 2]).astype(np.float32)
+ self.exemplar_img = np.repeat(
+ self.exemplar_img, self.params.scale_num, axis=0)
+
+ def _crop_and_resize(self, image, center, size, out_size, pad_color):
+ # convert box to corners (0-indexed)
+ size = round(size)
+ corners = np.concatenate((np.round(center - (size - 1) / 2),
+ np.round(center - (size - 1) / 2) + size))
+ corners = np.round(corners).astype(int)
+
+ # pad image if necessary
+ pads = np.concatenate((-corners[:2], corners[2:] - image.shape[:2]))
+ npad = max(0, int(pads.max()))
+ if npad > 0:
+ image = cv2.copyMakeBorder(
+ image,
+ npad,
+ npad,
+ npad,
+ npad,
+ cv2.BORDER_CONSTANT,
+ value=pad_color)
+
+ # crop image patch
+ corners = (corners + npad).astype(int)
+ patch = image[corners[0]:corners[2], corners[1]:corners[3]]
+
+ # resize to out_size
+ patch = cv2.resize(patch, (out_size, out_size))
+
+ return patch
+
+ def track(self, image):
+ #print("## track, input image shape:", image.shape)
+ self.frame_num += 1
+
+ image = np.asarray(image)
+ # search images
+ instance_images = [
+ self._crop_and_resize(
+ image,
+ self.center,
+ self.x_sz * f,
+ out_size=self.params.instance_sz,
+ pad_color=self.avg_color) for f in self.scale_factors
+ ]
+ instance_images = np.stack(instance_images, axis=0)
+ instance_images = np.transpose(instance_images,
+ [0, 3, 1, 2]).astype(np.float32)
+
+ # calculate response
+ # exemplar features
+ with fluid.dygraph.guard():
+ instance_images = fluid.dygraph.to_variable(instance_images)
+ self.exemplar_img = fluid.dygraph.to_variable(self.exemplar_img)
+ responses = self.model(self.exemplar_img, instance_images)
+
+ responses = responses.numpy()
+
+ responses = np.squeeze(responses, axis=1)
+ # upsample responses and penalize scale changes
+ responses = np.stack(
+ [
+ cv2.resize(
+ t, (self.upscale_sz, self.upscale_sz),
+ interpolation=cv2.INTER_CUBIC) for t in responses
+ ],
+ axis=0)
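+        # Penalize every scale except the middle (unchanged) one.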
+ responses[:self.params.scale_num // 2] *= self.params.scale_penalty
+ responses[self.params.scale_num // 2 + 1:] *= self.params.scale_penalty
+
+ # peak scale
+ scale_list = np.amax(responses, axis=(1, 2))
+ scale_id = np.argmax(scale_list)
+ #scale_id = np.argmax(np.amax(responses, axis=(1, 2)))
+ # peak location
+ response = responses[scale_id]
+ response -= response.min()
+ response /= response.sum() + 1e-16
+ response = (1 - self.params.window_influence) * response + \
+ self.params.window_influence * self.hann_window
+ loc = np.unravel_index(response.argmax(), response.shape)
+
+ # locate target center
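+        # Map the peak displacement from score-map cells to instance-patch pixels
+        # (total_stride / response_up) and then to image pixels
+        # (x_sz * scale_factor / instance_sz).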
+ disp_in_response = np.array(loc) - (self.upscale_sz - 1.) / 2
+ disp_in_instance = disp_in_response * \
+ self.params.total_stride / self.params.response_up
+ disp_in_image = disp_in_instance * self.x_sz * \
+ self.scale_factors[scale_id] / self.params.instance_sz
+ self.center += disp_in_image
+
+ # update target size
+ scale = (1 - self.params.scale_lr) * 1.0 + \
+ self.params.scale_lr * self.scale_factors[scale_id]
+ self.target_sz *= scale
+ self.z_sz *= scale
+ self.x_sz *= scale
+
+ # return 1-indexed and left-top based bounding box
+ box = np.array([
+ self.center[1] + 1 - (self.target_sz[1] - 1) / 2,
+ self.center[0] + 1 - (self.target_sz[0] - 1) / 2, self.target_sz[1],
+ self.target_sz[0]
+ ])
+
+ return box
diff --git a/PaddleCV/tracking/pytracking/utils/__init__.py b/PaddleCV/tracking/pytracking/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c99c377521f541c01de31c6e02813d362ff4d7c4
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/utils/__init__.py
@@ -0,0 +1,2 @@
+# from .evaluation import *
+from .params import *
diff --git a/PaddleCV/tracking/pytracking/utils/params.py b/PaddleCV/tracking/pytracking/utils/params.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcaf3d6831366dfb7d91584cf6ed488846261efd
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/utils/params.py
@@ -0,0 +1,31 @@
+from pytracking.libs import TensorList
+import random
+
+
+class TrackerParams:
+ """Class for tracker parameters."""
+
+ def free_memory(self):
+ for a in dir(self):
+ if not a.startswith('__') and hasattr(
+ getattr(self, a), 'free_memory'):
+ getattr(self, a).free_memory()
+
+
+class FeatureParams:
+ """Class for feature specific parameters"""
+
+ def __init__(self, *args, **kwargs):
+ if len(args) > 0:
+ raise ValueError
+
+ for name, val in kwargs.items():
+ if isinstance(val, list):
+ setattr(self, name, TensorList(val))
+ else:
+ setattr(self, name, val)
+
+
+def Choice(*args):
+ """Can be used to sample random parameter values."""
+ return random.choice(args)
diff --git a/PaddleCV/tracking/pytracking/utils/plotting.py b/PaddleCV/tracking/pytracking/utils/plotting.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef95655b47a35bf186939006b356986847ba3971
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/utils/plotting.py
@@ -0,0 +1,50 @@
+import matplotlib
+matplotlib.use('TkAgg')
+import matplotlib.pyplot as plt
+import numpy as np
+from pytracking.libs.paddle_utils import p2n, PTensor
+
+
+def save_tensor(a: PTensor, save_name):
+ a_np = p2n(a)
+ np.save(save_name, a_np)
+
+
+def show_tensor(a: PTensor, fig_num=None, title=None):
+ """Display a 2D tensor.
+ args:
+ fig_num: Figure number.
+ title: Title of figure.
+ """
+ a_np = a.squeeze().cpu().clone().detach().numpy()
+ if a_np.ndim == 3:
+ a_np = np.transpose(a_np, (1, 2, 0))
+ plt.figure(fig_num)
+ plt.tight_layout()
+ plt.cla()
+ plt.imshow(a_np)
+ plt.axis('off')
+ plt.axis('equal')
+ if title is not None:
+ plt.title(title)
+ plt.draw()
+ plt.pause(0.001)
+
+
+def plot_graph(a: PTensor, fig_num=None, title=None):
+ """Plot graph. Data is a 1D tensor.
+ args:
+ fig_num: Figure number.
+ title: Title of figure.
+ """
+ a_np = a.squeeze().cpu().clone().detach().numpy()
+ if a_np.ndim > 1:
+ raise ValueError
+ plt.figure(fig_num)
+ # plt.tight_layout()
+ plt.cla()
+ plt.plot(a_np)
+ if title is not None:
+ plt.title(title)
+ plt.draw()
+ plt.pause(0.001)
diff --git a/PaddleCV/tracking/pytracking/visualize_results_on_benchmark.ipynb b/PaddleCV/tracking/pytracking/visualize_results_on_benchmark.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..3207694a14580dbee99284492724d7d32ce82edc
--- /dev/null
+++ b/PaddleCV/tracking/pytracking/visualize_results_on_benchmark.ipynb
@@ -0,0 +1,176 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Load dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import sys\n",
+ "import cv2 as cv\n",
+ "import numpy as np\n",
+ "from matplotlib.pyplot import Rectangle\n",
+ "from videofig import videofig\n",
+ "\n",
+ "sys.path.append('..')\n",
+ "from pytracking.pysot_toolkit.datasets import DatasetFactory\n",
+ "from pytracking.pysot_toolkit.environment import env_settings\n",
+ "\n",
+ "# set the dataset name here\n",
+ "dataset_name = 'CVPR13'\n",
+ "\n",
+ "if dataset_name in ['CVPR13', 'OTB50', 'OTB100']:\n",
+ " # for OTB datasets, we save results into the same directory\n",
+ " save_dataset_name = 'OTB100'\n",
+ "else:\n",
+ " save_dataset_name = dataset_name\n",
+ "\n",
+ "dataset_root = os.path.join(env_settings().dataset_path, save_dataset_name)\n",
+ "\n",
+ "# load dataset\n",
+ "dataset = DatasetFactory.create_dataset(name=dataset_name, dataset_root=dataset_root, load_img=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset.videos.keys()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Select results to show"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tracker_test_params = 'siamfc.default'\n",
+ "exp_id = 'siamfc.siamfc_alexnet_vid.epoch49'\n",
+ "videoname = 'Bolt'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Show"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%matplotlib notebook\n",
+ "\n",
+ "if 'OTB100' == save_dataset_name:\n",
+ " filename = '{}.txt'.format(videoname)\n",
+ "elif 'VOT' in save_dataset_name:\n",
+ " filename = 'baseline/{vname}/{vname}_001.txt'.format(vname=videoname)\n",
+ "else:\n",
+ " raise NotImplemented\n",
+ " \n",
+ "video = dataset[videoname]\n",
+ "\n",
+ "# load tracking results\n",
+ "boxs = []\n",
+ "with open(os.path.join(env_settings().results_path, save_dataset_name, tracker_test_params, exp_id, filename), 'r') as file_handle:\n",
+ " for line in file_handle:\n",
+ " boxs.append([float(v) for v in line.strip().split(',')])\n",
+ "\n",
+ "def redraw_fn(f, ax):\n",
+ " img_path, _ = video[f]\n",
+ " img = cv.cvtColor(cv.imread(img_path), cv.COLOR_BGR2RGB)\n",
+ " \n",
+ " box = boxs[f]\n",
+ " if len(box) == 4:\n",
+ " x, y, w, h = box\n",
+ " else:\n",
+ " x, y, w, h = 0, 0, 0, 0\n",
+ " \n",
+ " if not redraw_fn.initialized:\n",
+ " redraw_fn.img_handle = ax.imshow(img)\n",
+ " box_artist = Rectangle((x, y), w, h,\n",
+ " fill=False, # remove background\n",
+ " lw=2,\n",
+ " edgecolor=\"red\")\n",
+ " ax.add_patch(box_artist)\n",
+ " redraw_fn.box_handle = box_artist\n",
+ " redraw_fn.text_handle = ax.text(0., 1 - 0.05,\n",
+ " 'Frame: {}'.format(f + 1),\n",
+ " transform=ax.transAxes,\n",
+ " color='yellow', size=12)\n",
+ " redraw_fn.initialized = True\n",
+ " else:\n",
+ " redraw_fn.img_handle.set_array(img)\n",
+ " redraw_fn.box_handle.set_xy((x, y))\n",
+ " redraw_fn.box_handle.set_width(w)\n",
+ " redraw_fn.box_handle.set_height(h)\n",
+ " redraw_fn.text_handle.set_text('Frame: {}'.format(f + 1))\n",
+ "\n",
+ "redraw_fn.initialized = False\n",
+ "\n",
+ "videofig(len(video), redraw_fn)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {},
+ "toc_section_display": true,
+ "toc_window_display": false
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/PaddleCV/tracking/requirements.txt b/PaddleCV/tracking/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4d4c2c330feb049481311ec4f2098107f0769e4a
--- /dev/null
+++ b/PaddleCV/tracking/requirements.txt
@@ -0,0 +1,11 @@
+git+https://github.com/tensorpack/dataflow.git
+cython
+pycocotools
+lmdb
+pandas
+jpeg4py
+opencv-python
+tensorboardX
+videofig
+jupyter
+tqdm