Imported Dwarf parser from Facebook folly

daa36650 · Alexey Milovidov · 957b59f0 · daa36650 · daa36650 · daa36650
5 changed file
--- a/dbms/src/Common/Dwarf.cpp
+++ b/dbms/src/Common/Dwarf.cpp
--- a/dbms/src/Common/Dwarf.h
+++ b/dbms/src/Common/Dwarf.h
+#pragma once
+
+/*
+ * Copyright 2012-present Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** This file was edited for ClickHouse.
+  */
+
+#include <string>
+#include <string_view>
+#include <variant>
+
+
+namespace DB
+{
+
+class Elf;
+
+/**
+ * DWARF record parser.
+ *
+ * We only implement enough DWARF functionality to convert from PC address
+ * to file and line number information.
+ *
+ * This means (although they're not part of the public API of this class), we
+ * can parse Debug Information Entries (DIEs), abbreviations, attributes (of
+ * all forms), and we can interpret bytecode for the line number VM.
+ *
+ * We can interpret DWARF records of version 2, 3, or 4, although we don't
+ * actually support many of the version 4 features (such as VLIW, multiple
+ * operations per instruction)
+ *
+ * Note that the DWARF record parser does not allocate heap memory at all.
+ * This is on purpose: you can use the parser from
+ * memory-constrained situations (such as an exception handler for
+ * std::out_of_memory)  If it weren't for this requirement, some things would
+ * be much simpler: the Path class would be unnecessary and would be replaced
+ * with a std::string; the list of file names in the line number VM would be
+ * kept as a vector of strings instead of re-executing the program to look for
+ * DW_LNE_define_file instructions, etc.
+ */
+class Dwarf
+{
+    // Note that Dwarf uses (and returns) std::string_view a lot.
+    // The std::string_view point within sections in the ELF file, and so will
+    // be live for as long as the passed-in Elf is live.
+public:
+    /** Create a DWARF parser around an ELF file. */
+    explicit Dwarf(const Elf & elf);
+
+    /**
+      * Represent a file path a s collection of three parts (base directory,
+      * subdirectory, and file).
+      */
+    class Path
+    {
+    public:
+        Path() {}
+
+        Path(std::string_view baseDir, std::string_view subDir, std::string_view file);
+
+        std::string_view baseDir() const { return baseDir_; }
+        std::string_view subDir() const { return subDir_; }
+        std::string_view file() const { return file_; }
+
+        size_t size() const;
+
+        /**
+        * Copy the Path to a buffer of size bufSize.
+        *
+        * toBuffer behaves like snprintf: It will always null-terminate the
+        * buffer (so it will copy at most bufSize-1 bytes), and it will return
+        * the number of bytes that would have been written if there had been
+        * enough room, so, if toBuffer returns a value >= bufSize, the output
+        * was truncated.
+        */
+        size_t toBuffer(char * buf, size_t bufSize) const;
+
+        void toString(std::string & dest) const;
+        std::string toString() const
+        {
+            std::string s;
+            toString(s);
+            return s;
+        }
+
+        // TODO(tudorb): Implement operator==, operator!=; not as easy as it
+        // seems as the same path can be represented in multiple ways
+    private:
+        std::string_view baseDir_;
+        std::string_view subDir_;
+        std::string_view file_;
+    };
+
+    enum class LocationInfoMode
+    {
+        // Don't resolve location info.
+        DISABLED,
+        // Perform CU lookup using .debug_aranges (might be incomplete).
+        FAST,
+        // Scan all CU in .debug_info (slow!) on .debug_aranges lookup failure.
+        FULL,
+    };
+
+    struct LocationInfo
+    {
+        bool hasMainFile = false;
+        Path mainFile;
+
+        bool hasFileAndLine = false;
+        Path file;
+        uint64_t line = 0;
+    };
+
+    /**
+   * Find the file and line number information corresponding to address.
+   */
+    bool findAddress(uintptr_t address, LocationInfo & info, LocationInfoMode mode) const;
+
+private:
+    static bool findDebugInfoOffset(uintptr_t address, std::string_view aranges, uint64_t & offset);
+
+    void init();
+    bool findLocation(uintptr_t address, std::string_view & infoEntry, LocationInfo & info) const;
+
+    const Elf * elf_;
+
+    // DWARF section made up of chunks, each prefixed with a length header.
+    // The length indicates whether the chunk is DWARF-32 or DWARF-64, which
+    // guides interpretation of "section offset" records.
+    // (yes, DWARF-32 and DWARF-64 sections may coexist in the same file)
+    class Section
+    {
+    public:
+        Section() : is64Bit_(false) {}
+
+        explicit Section(std::string_view d);
+
+        // Return next chunk, if any; the 4- or 12-byte length was already
+        // parsed and isn't part of the chunk.
+        bool next(std::string_view & chunk);
+
+        // Is the current chunk 64 bit?
+        bool is64Bit() const { return is64Bit_; }
+
+    private:
+        // Yes, 32- and 64- bit sections may coexist.  Yikes!
+        bool is64Bit_;
+        std::string_view data_;
+    };
+
+    // Abbreviation for a Debugging Information Entry.
+    struct DIEAbbreviation
+    {
+        uint64_t code;
+        uint64_t tag;
+        bool hasChildren;
+
+        struct Attribute
+        {
+            uint64_t name;
+            uint64_t form;
+        };
+
+        std::string_view attributes;
+    };
+
+    // Interpreter for the line number bytecode VM
+    class LineNumberVM
+    {
+    public:
+        LineNumberVM(std::string_view data, std::string_view compilationDirectory);
+
+        bool findAddress(uintptr_t address, Path & file, uint64_t & line);
+
+    private:
+        void init();
+        void reset();
+
+        // Execute until we commit one new row to the line number matrix
+        bool next(std::string_view & program);
+        enum StepResult
+        {
+            CONTINUE, // Continue feeding opcodes
+            COMMIT, // Commit new <address, file, line> tuple
+            END, // End of sequence
+        };
+        // Execute one opcode
+        StepResult step(std::string_view & program);
+
+        struct FileName
+        {
+            std::string_view relativeName;
+            // 0 = current compilation directory
+            // otherwise, 1-based index in the list of include directories
+            uint64_t directoryIndex;
+        };
+        // Read one FileName object, remove_prefix sp
+        static bool readFileName(std::string_view & sp, FileName & fn);
+
+        // Get file name at given index; may be in the initial table
+        // (fileNames_) or defined using DW_LNE_define_file (and we reexecute
+        // enough of the program to find it, if so)
+        FileName getFileName(uint64_t index) const;
+
+        // Get include directory at given index
+        std::string_view getIncludeDirectory(uint64_t index) const;
+
+        // Execute opcodes until finding a DW_LNE_define_file and return true;
+        // return file at the end.
+        bool nextDefineFile(std::string_view & program, FileName & fn) const;
+
+        // Initialization
+        bool is64Bit_;
+        std::string_view data_;
+        std::string_view compilationDirectory_;
+
+        // Header
+        uint16_t version_;
+        uint8_t minLength_;
+        bool defaultIsStmt_;
+        int8_t lineBase_;
+        uint8_t lineRange_;
+        uint8_t opcodeBase_;
+        const uint8_t * standardOpcodeLengths_;
+
+        std::string_view includeDirectories_;
+        size_t includeDirectoryCount_;
+
+        std::string_view fileNames_;
+        size_t fileNameCount_;
+
+        // State machine registers
+        uint64_t address_;
+        uint64_t file_;
+        uint64_t line_;
+        uint64_t column_;
+        bool isStmt_;
+        bool basicBlock_;
+        bool endSequence_;
+        bool prologueEnd_;
+        bool epilogueBegin_;
+        uint64_t isa_;
+        uint64_t discriminator_;
+    };
+
+    // Read an abbreviation from a std::string_view, return true if at end; remove_prefix sp
+    static bool readAbbreviation(std::string_view & sp, DIEAbbreviation & abbr);
+
+    // Get abbreviation corresponding to a code, in the chunk starting at
+    // offset in the .debug_abbrev section
+    DIEAbbreviation getAbbreviation(uint64_t code, uint64_t offset) const;
+
+    // Read one attribute <name, form> pair, remove_prefix sp; returns <0, 0> at end.
+    static DIEAbbreviation::Attribute readAttribute(std::string_view & sp);
+
+    // Read one attribute value, remove_prefix sp
+    typedef std::variant<uint64_t, std::string_view> AttributeValue;
+    AttributeValue readAttributeValue(std::string_view & sp, uint64_t form, bool is64Bit) const;
+
+    // Get an ELF section by name, return true if found
+    bool getSection(const char * name, std::string_view * section) const;
+
+    // Get a string from the .debug_str section
+    std::string_view getStringFromStringSection(uint64_t offset) const;
+
+    std::string_view info_; // .debug_info
+    std::string_view abbrev_; // .debug_abbrev
+    std::string_view aranges_; // .debug_aranges
+    std::string_view line_; // .debug_line
+    std::string_view strings_; // .debug_str
+};
+
+}
--- a/dbms/src/Common/Elf.cpp
+++ b/dbms/src/Common/Elf.cpp
 #include <Common/Elf.h>
 #include <Common/Exception.h>

-#include <iostream>
+#include <string.h>


 namespace DB
@@ -16,8 +16,6 @@ namespace ErrorCodes
 Elf::Elf(const std::string & path)
    : in(path, 0)
 {
-    std::cerr << "Processing path " << path << "\n";
-
    /// Check if it's an elf.
    size = in.buffer().size();
    if (size < sizeof(ElfEhdr))
@@ -98,6 +96,12 @@ std::optional<Elf::Section> Elf::findSection(std::function<bool(const Section &
 }


+std::optional<Elf::Section> Elf::findSectionByName(const char * name) const
+{
+    return findSection([&](const Section & section, size_t) { return 0 == strcmp(name, section.name()); });
+}
+
+
 const char * Elf::Section::name() const
 {
    if (!elf.section_names)
@@ -115,7 +119,12 @@ const char * Elf::Section::begin() const

 const char * Elf::Section::end() const
 {
-    return elf.mapped + header.sh_offset + header.sh_size;
+    return begin() + size();
+}
+
+size_t Elf::Section::size() const
+{
+    return header.sh_size;
 }

 }
--- a/dbms/src/Common/Elf.h
+++ b/dbms/src/Common/Elf.h
@@ -21,7 +21,9 @@ using ElfSym = ElfW(Sym);
 namespace DB
 {

-class Elf
+/** Allow to navigate sections in ELF.
+  */
+class Elf final
 {
 public:
    struct Section
@@ -31,6 +33,7 @@ public:

        const char * begin() const;
        const char * end() const;
+        size_t size() const;

        Section(const ElfShdr & header, const Elf & elf);

@@ -40,8 +43,9 @@ public:

    Elf(const std::string & path);

-    std::optional<Section> findSection(std::function<bool(const Section & section, size_t idx)> && pred) const;
    bool iterateSections(std::function<bool(const Section & section, size_t idx)> && pred) const;
+    std::optional<Section> findSection(std::function<bool(const Section & section, size_t idx)> && pred) const;
+    std::optional<Section> findSectionByName(const char * name) const;

    const char * end() const { return mapped + size; }


--- a/dbms/src/Common/ErrorCodes.cpp
+++ b/dbms/src/Common/ErrorCodes.cpp
@@ -439,6 +439,7 @@ namespace ErrorCodes
    extern const int CANNOT_DELETE_TIMER = 462;
    extern const int CANNOT_FCNTL = 463;
    extern const int CANNOT_PARSE_ELF = 464;
+    extern const int CANNOT_PARSE_DWARF = 465;

    extern const int KEEPER_EXCEPTION = 999;
    extern const int POCO_EXCEPTION = 1000;