diff --git a/gpAux/extensions/gps3ext/include/gpwriter.h b/gpAux/extensions/gps3ext/include/gpwriter.h index 5d678df6e2f31691b0199a6e0fca83a9f48dbc80..5fc83e285f2167a0be94a595702855d6a951a1e3 100644 --- a/gpAux/extensions/gps3ext/include/gpwriter.h +++ b/gpAux/extensions/gps3ext/include/gpwriter.h @@ -54,34 +54,4 @@ GPWriter *writer_init(const char *url_with_options); bool writer_transfer_data(GPWriter *writer, char *data_buf, int &data_len); bool writer_cleanup(GPWriter **writer); -// Set schema to 'https' or 'http' -inline string replaceSchemaFromURL(const string &url) { - size_t iend = url.find("://"); - if (iend == string::npos) { - return url; - } - return "https" + url.substr(iend); -} - -inline string getRegionFromURL(const string &url) { - size_t ibegin = - url.find("://s3") + strlen("://s3"); // index of character('.' or '-') after "3" - size_t iend = url.find(".amazonaws.com"); - - string region; - if (iend == string::npos) { - return region; - } else if (ibegin == iend) { // "s3.amazonaws.com" - return "external-1"; - } else { - // ibegin + 1 is the character after "s3." or "s3-" - // for instance: s3-us-west-2.amazonaws.com - region = url.substr(ibegin + 1, iend - (ibegin + 1)); - } - - if (region.compare("us-east-1") == 0) { - region = "external-1"; - } - return region; -} #endif /* INCLUDE_GPWRITER_H_ */ diff --git a/gpAux/extensions/gps3ext/include/makefile.inc b/gpAux/extensions/gps3ext/include/makefile.inc index 6206406c259a32956a7fecc449906f07b33c0762..a40cb38ca1fac3e35bbd4e2922c5986d5a61f81a 100644 --- a/gpAux/extensions/gps3ext/include/makefile.inc +++ b/gpAux/extensions/gps3ext/include/makefile.inc @@ -1,4 +1,4 @@ -COMMON_OBJS = gpreader.o gpwriter.o s3conf.o s3common.o s3utils.o s3log.o s3url_parser.o s3http_headers.o s3interface.o s3restful_service.o decompress_reader.o s3key_reader.o s3key_writer.o s3bucket_reader.o s3common_reader.o +COMMON_OBJS = gpreader.o gpwriter.o s3conf.o s3common.o s3utils.o s3log.o s3url.o s3http_headers.o s3interface.o s3restful_service.o decompress_reader.o s3key_reader.o s3key_writer.o s3bucket_reader.o s3common_reader.o COMMON_LINK_OPTIONS = -lstdc++ -lxml2 -lpthread -lcrypto -lcurl -lz diff --git a/gpAux/extensions/gps3ext/include/s3bucket_reader.h b/gpAux/extensions/gps3ext/include/s3bucket_reader.h index c22c15e1030561e3e75ca22df3d148ef64a67e4a..0475a60d77d9563cd0736667eef675394c0fcd94 100644 --- a/gpAux/extensions/gps3ext/include/s3bucket_reader.h +++ b/gpAux/extensions/gps3ext/include/s3bucket_reader.h @@ -73,9 +73,6 @@ class S3BucketReader : public Reader { ListBucketResult keyList; // List of matched keys/files. uint64_t keyIndex; // BucketContent index of keylist->contents. - void SetSchema(); - void SetRegion(); - void SetBucketAndPrefix(); BucketContent *getNextKey(); ReaderParams getReaderParams(BucketContent *key); }; diff --git a/gpAux/extensions/gps3ext/include/s3key_reader.h b/gpAux/extensions/gps3ext/include/s3key_reader.h index beb90dac700a6e4a643bcfcb3407dbf0ec70491a..03b61fb9346ec6e3c43595371a5534e2bb06b4bb 100644 --- a/gpAux/extensions/gps3ext/include/s3key_reader.h +++ b/gpAux/extensions/gps3ext/include/s3key_reader.h @@ -9,7 +9,7 @@ using std::string; #include "s3interface.h" #include "s3macros.h" #include "s3restful_service.h" -#include "s3url_parser.h" +#include "s3url.h" #include diff --git a/gpAux/extensions/gps3ext/include/s3key_writer.h b/gpAux/extensions/gps3ext/include/s3key_writer.h index a57b1ceee139ef4f0759a3e03ba9298934c61ea5..d0bff27ff287e7219cf64b1a342cf8e2a1f08e3b 100644 --- a/gpAux/extensions/gps3ext/include/s3key_writer.h +++ b/gpAux/extensions/gps3ext/include/s3key_writer.h @@ -5,7 +5,7 @@ #include "s3interface.h" #include "s3macros.h" #include "s3restful_service.h" -#include "s3url_parser.h" +#include "s3url.h" #include "writer.h" #include diff --git a/gpAux/extensions/gps3ext/include/s3url_parser.h b/gpAux/extensions/gps3ext/include/s3url.h similarity index 59% rename from gpAux/extensions/gps3ext/include/s3url_parser.h rename to gpAux/extensions/gps3ext/include/s3url.h index 31ed1783e3e8682607c4b3c95c5182df69c5430f..ae54767d95eb9aa6a707e1b707e7c534a423d885 100644 --- a/gpAux/extensions/gps3ext/include/s3url_parser.h +++ b/gpAux/extensions/gps3ext/include/s3url.h @@ -1,5 +1,5 @@ -#ifndef __S3_URL_PARSER_H__ -#define __S3_URL_PARSER_H__ +#ifndef __S3_URL_H__ +#define __S3_URL_H__ #include "http_parser.h" @@ -34,4 +34,17 @@ class UrlParser { string fullurl; }; +class S3UrlUtility { + public: + // Set schema to 'https' or 'http' + static string replaceSchemaFromURL(const string &url, bool useHttps = true); + + static string getDefaultSchema(bool useHttps = true); + + static string getRegionFromURL(const string &url); + + static string getBucketFromURL(const string &url); + + static string getPrefixFromURL(const string &url); +}; #endif diff --git a/gpAux/extensions/gps3ext/src/gpwriter.cpp b/gpAux/extensions/gps3ext/src/gpwriter.cpp index 0c99a0854ae76f93b09568848d16ef78d4fea4cd..10556d2d5ada249d37c86ca9dea352c1116bdba2 100644 --- a/gpAux/extensions/gps3ext/src/gpwriter.cpp +++ b/gpAux/extensions/gps3ext/src/gpwriter.cpp @@ -12,13 +12,14 @@ #include "s3conf.h" #include "s3log.h" #include "s3macros.h" +#include "s3url.h" #include "s3utils.h" using std::string; using std::stringstream; GPWriter::GPWriter(const string& url) { - string file = replaceSchemaFromURL(url); + string file = S3UrlUtility::replaceSchemaFromURL(url, s3ext_encryption); constructWriterParams(file); restfulServicePtr = &restfulService; } @@ -29,7 +30,7 @@ void GPWriter::constructWriterParams(const string& url) { this->params.setSegNum(s3ext_segnum); this->params.setNumOfChunks(s3ext_threadnum); this->params.setChunkSize(s3ext_chunksize); - this->params.setRegion(getRegionFromURL(url)); + this->params.setRegion(S3UrlUtility::getRegionFromURL(url)); this->cred.accessID = s3ext_accessid; this->cred.secret = s3ext_secret; diff --git a/gpAux/extensions/gps3ext/src/s3bucket_reader.cpp b/gpAux/extensions/gps3ext/src/s3bucket_reader.cpp index 8bffad3b2c308619416967c4cbfd7146e3174293..5d302090d702909c7b8debbb75fc4b0f65a56739 100644 --- a/gpAux/extensions/gps3ext/src/s3bucket_reader.cpp +++ b/gpAux/extensions/gps3ext/src/s3bucket_reader.cpp @@ -11,6 +11,7 @@ #include "s3log.h" #include "s3macros.h" #include "s3params.h" +#include "s3url.h" #include "s3utils.h" using std::string; @@ -111,19 +112,6 @@ void S3BucketReader::close() { return; } -// Set schema to 'https' or 'http' -void S3BucketReader::SetSchema() { - size_t iend = this->url.find("://"); - if (iend == string::npos) { - return; - } - - this->schema = this->url.substr(0, iend); - if (this->schema == "s3") { - this->schema = s3ext_encryption ? "https" : "http"; - } -} - string S3BucketReader::getKeyURL(const string &key) { stringstream sstr; sstr << this->schema << "://" @@ -132,60 +120,11 @@ string S3BucketReader::getKeyURL(const string &key) { return sstr.str(); } -// Set AWS region, use 'external-1' if it is 'us-east-1' or not present -// http://docs.aws.amazon.com/general/latest/gr/rande.html#s3_region -void S3BucketReader::SetRegion() { - size_t ibegin = - this->url.find("://s3") + strlen("://s3"); // index of character('.' or '-') after "3" - size_t iend = this->url.find(".amazonaws.com"); - - if (iend == string::npos) { - return; - } else if (ibegin == iend) { // "s3.amazonaws.com" - this->region = "external-1"; - } else { - // ibegin + 1 is the character after "s3." or "s3-" - // for instance: s3-us-west-2.amazonaws.com - this->region = this->url.substr(ibegin + 1, iend - (ibegin + 1)); - } - - if (this->region.compare("us-east-1") == 0) { - this->region = "external-1"; - } -} - -void S3BucketReader::SetBucketAndPrefix() { - size_t ibegin = find_Nth(this->url, 3, "/"); - size_t iend = find_Nth(this->url, 4, "/"); - - if (ibegin == string::npos) { - return; - } - // s3://s3-region.amazonaws.com/bucket - if (iend == string::npos) { - this->bucket = url.substr(ibegin + 1, url.length() - ibegin - 1); - this->prefix = ""; - return; - } - - this->bucket = url.substr(ibegin + 1, iend - ibegin - 1); - - // s3://s3-region.amazonaws.com/bucket/ - if (iend == url.length() - 1) { - this->prefix = ""; - return; - } - - ibegin = find_Nth(url, 4, "/"); - // s3://s3-region.amazonaws.com/bucket/prefix - // s3://s3-region.amazonaws.com/bucket/prefix/whatever - this->prefix = url.substr(ibegin + 1, url.length() - ibegin - 1); -} - void S3BucketReader::parseURL() { - this->SetSchema(); - this->SetRegion(); - this->SetBucketAndPrefix(); + this->schema = s3ext_encryption ? "https" : "http"; + this->region = S3UrlUtility::getRegionFromURL(this->url); + this->bucket = S3UrlUtility::getBucketFromURL(this->url); + this->prefix = S3UrlUtility::getPrefixFromURL(this->url); bool ok = !(this->schema.empty() || this->region.empty() || this->bucket.empty()); CHECK_OR_DIE_MSG(ok, "'%s' is not valid", this->url.c_str()); diff --git a/gpAux/extensions/gps3ext/src/s3interface.cpp b/gpAux/extensions/gps3ext/src/s3interface.cpp index 6510fdc6641113b1cdbe9d651b639a138931551f..140804f5f0149312affabb7b520cc66a66feae02 100644 --- a/gpAux/extensions/gps3ext/src/s3interface.cpp +++ b/gpAux/extensions/gps3ext/src/s3interface.cpp @@ -13,7 +13,7 @@ #include "s3key_reader.h" #include "s3log.h" #include "s3macros.h" -#include "s3url_parser.h" +#include "s3url.h" #include "s3utils.h" #include "s3interface.h" diff --git a/gpAux/extensions/gps3ext/src/s3url.cpp b/gpAux/extensions/gps3ext/src/s3url.cpp new file mode 100644 index 0000000000000000000000000000000000000000..379b7150dd9bc68f21d97872fae9bd6ebc7861fa --- /dev/null +++ b/gpAux/extensions/gps3ext/src/s3url.cpp @@ -0,0 +1,113 @@ +#include +#include +#include +#include + +#include +#include + +#include "http_parser.h" + +#include "s3log.h" +#include "s3macros.h" +#include "s3url.h" +#include "s3utils.h" + +using std::string; +using std::stringstream; + +UrlParser::UrlParser(const string &url) { + CHECK_ARG_OR_DIE(!url.empty()); + + this->fullurl = url; + + struct http_parser_url url_parser; + int result = + http_parser_parse_url(this->fullurl.c_str(), this->fullurl.length(), false, &url_parser); + CHECK_OR_DIE_MSG(result == 0, "Failed to parse URL %s at field %d", this->fullurl.c_str(), + result); + + this->schema = extractField(&url_parser, UF_SCHEMA); + this->host = extractField(&url_parser, UF_HOST); + this->path = extractField(&url_parser, UF_PATH); + this->query = extractField(&url_parser, UF_QUERY); +} + +string UrlParser::extractField(const struct http_parser_url *url_parser, http_parser_url_fields i) { + if ((url_parser->field_set & (1 << i)) == 0) { + return ""; + } + + return this->fullurl.substr(url_parser->field_data[i].off, url_parser->field_data[i].len); +} + +string S3UrlUtility::replaceSchemaFromURL(const string &url, bool useHttps) { + size_t iend = url.find("://"); + if (iend == string::npos) { + return url; + } + + return getDefaultSchema(useHttps) + url.substr(iend); +} + +string S3UrlUtility::getDefaultSchema(bool useHttps) { + return useHttps ? "https" : "http"; +} + +// Set AWS region, use 'external-1' if it is 'us-east-1' or not present +// http://docs.aws.amazon.com/general/latest/gr/rande.html#s3_region +string S3UrlUtility::getRegionFromURL(const string &url) { + size_t ibegin = + url.find("://s3") + strlen("://s3"); // index of character('.' or '-') after "3" + size_t iend = url.find(".amazonaws.com"); + + string region; + if (iend == string::npos) { + return region; + } else if (ibegin == iend) { // "s3.amazonaws.com" + return "external-1"; + } else { + // ibegin + 1 is the character after "s3." or "s3-" + // for instance: s3-us-west-2.amazonaws.com + region = url.substr(ibegin + 1, iend - (ibegin + 1)); + } + + if (region.compare("us-east-1") == 0) { + region = "external-1"; + } + return region; +} + +string S3UrlUtility::getBucketFromURL(const string &url) { + size_t ibegin = find_Nth(url, 3, "/"); + size_t iend = find_Nth(url, 4, "/"); + if (ibegin == string::npos) { + return string(); + } + // s3://s3-region.amazonaws.com/bucket + if (iend == string::npos) { + return url.substr(ibegin + 1, url.length() - ibegin - 1); + } + + return url.substr(ibegin + 1, iend - ibegin - 1); +} + +string S3UrlUtility::getPrefixFromURL(const string &url) { + size_t ibegin = find_Nth(url, 3, "/"); + size_t iend = find_Nth(url, 4, "/"); + + // s3://s3-region.amazonaws.com/bucket + if (ibegin == string::npos || iend == string::npos) { + return string(); + } + + // s3://s3-region.amazonaws.com/bucket/ + if (iend == url.length() - 1) { + return string(); + } + + ibegin = find_Nth(url, 4, "/"); + // s3://s3-region.amazonaws.com/bucket/prefix + // s3://s3-region.amazonaws.com/bucket/prefix/whatever + return url.substr(ibegin + 1, url.length() - ibegin - 1); +} \ No newline at end of file diff --git a/gpAux/extensions/gps3ext/src/s3url_parser.cpp b/gpAux/extensions/gps3ext/src/s3url_parser.cpp deleted file mode 100644 index aa30c6585762b5c8273d9ae611f9d5a31c227eb6..0000000000000000000000000000000000000000 --- a/gpAux/extensions/gps3ext/src/s3url_parser.cpp +++ /dev/null @@ -1,41 +0,0 @@ -#include -#include -#include -#include - -#include -#include - -#include "http_parser.h" - -#include "s3log.h" -#include "s3macros.h" -#include "s3url_parser.h" - -using std::string; -using std::stringstream; - -UrlParser::UrlParser(const string &url) { - CHECK_ARG_OR_DIE(!url.empty()); - - this->fullurl = url; - - struct http_parser_url url_parser; - int result = - http_parser_parse_url(this->fullurl.c_str(), this->fullurl.length(), false, &url_parser); - CHECK_OR_DIE_MSG(result == 0, "Failed to parse URL %s at field %d", this->fullurl.c_str(), - result); - - this->schema = extractField(&url_parser, UF_SCHEMA); - this->host = extractField(&url_parser, UF_HOST); - this->path = extractField(&url_parser, UF_PATH); - this->query = extractField(&url_parser, UF_QUERY); -} - -string UrlParser::extractField(const struct http_parser_url *url_parser, http_parser_url_fields i) { - if ((url_parser->field_set & (1 << i)) == 0) { - return ""; - } - - return this->fullurl.substr(url_parser->field_data[i].off, url_parser->field_data[i].len); -} \ No newline at end of file diff --git a/gpAux/extensions/gps3ext/test/s3bucket_reader_test.cpp b/gpAux/extensions/gps3ext/test/s3bucket_reader_test.cpp index 2c7a6439cb9e67684235c95d2769dd3c75bb193d..6fe560640da551a69c4e3b99cd2774e76c314a8e 100644 --- a/gpAux/extensions/gps3ext/test/s3bucket_reader_test.cpp +++ b/gpAux/extensions/gps3ext/test/s3bucket_reader_test.cpp @@ -74,6 +74,12 @@ TEST_F(S3BucketReaderTest, ParseURL_normal) { EXPECT_EQ("dataset1/normal", this->bucketReader->getPrefix()); } +//cannot find '://', so return url itself +TEST_F(S3BucketReaderTest, ParseURL_noSchema) { + string url = "abcd"; + EXPECT_EQ(url, S3UrlUtility::replaceSchemaFromURL(url)); +} + TEST_F(S3BucketReaderTest, ParseURL_NoPrefixAndSlash) { EXPECT_NO_THROW( this->bucketReader->parseURL("s3://s3-us-west-2.amazonaws.com/s3test.pivotal.io")); diff --git a/gpAux/extensions/gps3ext/test/s3url_parser_test.cpp b/gpAux/extensions/gps3ext/test/s3url_test.cpp similarity index 97% rename from gpAux/extensions/gps3ext/test/s3url_parser_test.cpp rename to gpAux/extensions/gps3ext/test/s3url_test.cpp index 1d936bb285ae73062500b3d583d4de1bcfd40df2..cd2dce8146dac191383111c0ceec45a65f8c9d62 100644 --- a/gpAux/extensions/gps3ext/test/s3url_parser_test.cpp +++ b/gpAux/extensions/gps3ext/test/s3url_test.cpp @@ -1,4 +1,4 @@ -#include "s3url_parser.cpp" +#include "s3url.cpp" #include "gtest/gtest.h" #include "s3macros.h"