diff --git a/.gitignore b/.gitignore
index 33b9a727e2a8c6707c151a4bb47b3b78d1127693..7b74e6619f3c99ee19eb44a364ed249e09bafeb4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@
.DS_Store
__pycache__
*.pyc
+output
diff --git a/README.md b/README.md
index 3d0b3184b0bf11e3a7487c6167f996f90f20a992..a3f87f0a7b1e91d3005dc60836f294ebe27f18c4 100644
--- a/README.md
+++ b/README.md
@@ -24,3 +24,63 @@
1. 斯坦福大学的评测:AlpacaEval Logo Leaderboard
2.
3.
+4. https://github.com/the-crypt-keeper/can-ai-code
+5. https://github.com/THUDM/CodeGeeX/tree/main/codegeex/benchmark
+6. https://github.com/openai/human-eval
+
+
+## HumanEval-X
+HumanEval-X中每个语言的样本,包含了声明、描述和解答,它们之间的组合可以支持不同的下游任务,包括生成、翻译、概括等。我们目前关注两个任务:代码生成与代码翻译。对于代码生成任务,模型将函数声明与文档字符串作为输入,输出函数实现;对于代码翻译任务,模型将两种语言的函数声明与源语言的实现作为输入,输出目标语言上的实现。我们在代码翻译任务中不将文档字符串输入模型,以避免模型直接通过描述生成答案。在两种任务下,我们都采用Codex所使用的无偏pass@k指标:
+$\text{pass}@k := \mathbb{E}\left[1 - \binom{n-c}{k} \Big/ \binom{n}{k}\right]$,其中 $n=200$ 为每道题生成的样本数,$c$ 为其中通过全部测例的样本数,$k \in \{1, 10, 100\}$。
+
+样本使用JSON Lines格式(每行一个JSON对象)存储在eval_set/humaneval-x/[LANG]/data/humaneval_[LANG].jsonl,每条样本包含6个部分:
+
+- `task_id`: 题目的目标语言与ID。语言为["Python", "Java", "JavaScript", "CPP", "Go"]之一。
+- `prompt`: 函数声明与描述,用于代码生成。
+- `declaration`: 仅有函数声明,用于代码翻译。
+- `canonical_solution`: 手写的示例解答。
+- `test`: 隐藏测例,用于评测。
+- `example_test`: 提示中出现的公开测例,用于评测。
+
+评测生成的代码需要使用多种语言编译、运行。我们使用的各编程语言依赖及所用包的版本如下:
+
+| 依赖 | 版本 |
+| ---- | ---- |
+| Python | 3.8.12 |
+| JDK | 18.0.2.1 |
+| Node.js | 16.14.0 |
+| js-md5 | 0.7.3 |
+| C++ | 11 |
+| g++ | 7.5.0 |
+| Boost | 1.71.0 |
+| OpenSSL | 3.0.0 |
+| go | 1.18.4 |
+
+
+
+## 我们的工作
+
+1. 基于清华的HumanEval-X进行了集成,并修改了代码生成任务的结构;
+2. 支持多模型配置:可以配置模型参数,并选择调用接口还是本地推理;
+3. 优化了从代码块中抽取方法体的逻辑;
+4. 目前适配了java、python、cpp、js和go五种语言。
+
+
+
+## 测试结果
+
+受限于模型推理速度,目前仅测试了pass@1指标。
+
+| | python | java | cpp | js | go |
+|-------------|--------|--------|--------|--------|---------|
+| chatgpt | 64.02% | 15.85% | 26.22% | 47.00% | 31.70% |
+| bbt-7B | 0.61% | 1.83% | 1.22% | 1.83% | 0% |
+| chatglm2-7B | 7.93% | 5.45% | 0.61% | 6.70% | 1.83% |
+
+
+
+
+## TODO
+1. 测试更多开源模型,例如百川、llama2、rwkv。
+2. 测试模型的pass@10和pass@100指标。
+3. 代码翻译类任务尚未适配,同时还需要构造相关的数据。
diff --git a/eval_set/humaneval-x/cpp/data/humaneval_cpp.jsonl b/eval_set/humaneval-x/cpp/data/humaneval_cpp.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..4c4ce46302c3dcc1e40e517f14b3407fce16053d
--- /dev/null
+++ b/eval_set/humaneval-x/cpp/data/humaneval_cpp.jsonl
@@ -0,0 +1,164 @@
+{"task_id": "CPP/0", "prompt": "/*\nCheck if in given vector of numbers, are any two numbers closer to each other than\ngiven threshold.\n>>> has_close_elements({1.0, 2.0, 3.0}, 0.5)\nfalse\n>>> has_close_elements({1.0, 2.8, 3.0, 4.0, 5.0, 2.0}, 0.3)\ntrue\n*/\n#include\n#include\n#include\nusing namespace std;\nbool has_close_elements(vector numbers, float threshold){\n", "canonical_solution": " int i,j;\n \n for (i=0;i\nint main(){\n vector a={1.0, 2.0, 3.9, 4.0, 5.0, 2.2};\n assert (has_close_elements(a, 0.3)==true);\n assert (has_close_elements(a, 0.05) == false);\n\n assert (has_close_elements({1.0, 2.0, 5.9, 4.0, 5.0}, 0.95) == true);\n assert (has_close_elements({1.0, 2.0, 5.9, 4.0, 5.0}, 0.8) ==false);\n assert (has_close_elements({1.0, 2.0, 3.0, 4.0, 5.0}, 2.0) == true);\n assert (has_close_elements({1.1, 2.2, 3.1, 4.1, 5.1}, 1.0) == true);\n assert (has_close_elements({1.1, 2.2, 3.1, 4.1, 5.1}, 0.5) == false);\n \n}\n", "declaration": "#include\n#include\n#include\nusing namespace std;\n#include\n#include\nbool has_close_elements(vector numbers, float threshold){\n", "example_test": "#undef NDEBUG\n#include\nint main(){\n assert (has_close_elements({1.0, 2.0, 3.0}, 0.5) == false && \"failure 1\");\n assert (has_close_elements({1.0, 2.8, 3.0, 4.0, 5.0, 2.0}, 0.3) && \"failure 2\") ;\n}\n"}
+{"task_id": "CPP/1", "prompt": "/*\nInput to this function is a string containing multiple groups of nested parentheses. Your goal is to\nseparate those group into separate strings and return the vector of those.\nSeparate groups are balanced (each open brace is properly closed) and not nested within each other\nIgnore any spaces in the input string.\n>>> separate_paren_groups(\"( ) (( )) (( )( ))\")\n{\"()\", \"(())\", \"(()())\"}\n*/\n#include\n#include\n#include\nusing namespace std;\nvector separate_paren_groups(string paren_string){\n", "canonical_solution": " vector all_parens;\n string current_paren;\n int level=0;\n char chr;\n int i;\n for (i=0;i\nbool issame(vector a,vectorb){\n if (a.size()!=b.size()) return false;\n for (int i=0;i\n#include\n#include\nusing namespace std;\n#include\n#include\n#include\nvector separate_paren_groups(string paren_string){\n", "example_test": "#undef NDEBUG\n#include\nbool issame(vector a,vectorb){\n if (a.size()!=b.size()) return false;\n for (int i=0;i>> truncate_number(3.5)\n0.5\n*/\n#include\n#include\nusing namespace std;\nfloat truncate_number(float number){\n", "canonical_solution": " return number-int(number);\n}\n", "test": "#undef NDEBUG\n#include\nint main(){\n assert (truncate_number(3.5) == 0.5); \n assert (abs(truncate_number(1.33) - 0.33) < 1e-4);\n assert (abs(truncate_number(123.456) - 0.456) < 1e-4);\n}", "declaration": "#include\n#include\nusing namespace std;\n#include\n#include\nfloat truncate_number(float number){\n", "example_test": "#undef NDEBUG\n#include\nint main(){\n assert (truncate_number(3.5) == 0.5); \n}\n"}
+{"task_id": "CPP/3", "prompt": "/*\nYou\"re given a vector of deposit and withdrawal operations on a bank account that starts with\nzero balance. Your task is to detect if at any point the balance of account falls below zero, and\nat that point function should return true. Otherwise it should return false.\n>>> below_zero({1, 2, 3})\nfalse\n>>> below_zero({1, 2, -4, 5})\ntrue\n*/\n#include\n#include\nusing namespace std;\nbool below_zero(vector operations){\n", "canonical_solution": " int num=0;\n for (int i=0;i\nint main(){\n assert (below_zero({}) == false);\n assert (below_zero({1, 2, -3, 1, 2, -3}) == false);\n assert (below_zero({1, 2, -4, 5, 6}) == true);\n assert (below_zero({1, -1, 2, -2, 5, -5, 4, -4}) == false);\n assert (below_zero({1, -1, 2, -2, 5, -5, 4, -5}) == true);\n assert (below_zero({1, -2, 2, -2, 5, -5, 4, -4}) == true);\n}", "declaration": "#include\n#include\nusing namespace std;\n#include\n#include\n#include\nbool below_zero(vector operations){\n", "example_test": "#undef NDEBUG\n#include\nint main(){\n assert (below_zero({1, 2, 3}) == false);\n assert (below_zero({1, 2, -4, 5}) == true);\n}\n"}
+{"task_id": "CPP/4", "prompt": "/*\nFor a given vector of input numbers, calculate Mean Absolute Deviation\naround the mean of this dataset.\nMean Absolute Deviation is the average absolute difference between each\nelement and a centerpoint (mean in this case):\nMAD = average | x - x_mean |\n>>> mean_absolute_deviation({1.0, 2.0, 3.0, 4.0})\n1.0\n*/\n#include\n#include\n#include\nusing namespace std;\nfloat mean_absolute_deviation(vector numbers){\n", "canonical_solution": " float sum=0;\n float avg,msum,mavg;\n int i=0;\n for (i=0;i\nint main(){\n assert (abs(mean_absolute_deviation({1.0, 2.0, 3.0}) - 2.0/3.0) < 1e-4);\n assert (abs(mean_absolute_deviation({1.0, 2.0, 3.0, 4.0}) - 1.0) < 1e-4);\n assert (abs(mean_absolute_deviation({1.0, 2.0, 3.0, 4.0, 5.0}) - 6.0/5.0) < 1e-4);\n}", "declaration": "#include\n#include\n#include\nusing namespace std;\n#include\n#include\nfloat mean_absolute_deviation(vector numbers){\n", "example_test": "#undef NDEBUG\n#include\nint main(){\n assert (abs(mean_absolute_deviation({1.0, 2.0, 3.0, 4.0}) - 1.0) < 1e-4);\n}\n"}
+{"task_id": "CPP/5", "prompt": "/*\nInsert a number \"delimeter\" between every two consecutive elements of input vector `numbers\"\n>>> intersperse({}, 4)\n{}\n>>> intersperse({1, 2, 3}, 4)\n{1, 4, 2, 4, 3}\n*/\n#include\n#include\nusing namespace std;\nvector intersperse(vector numbers, int delimeter){ \n", "canonical_solution": " vector out={};\n if (numbers.size()>0) out.push_back(numbers[0]);\n for (int i=1;i\nbool issame(vector a,vectorb){\n if (a.size()!=b.size()) return false;\n for (int i=0;i\n#include\nusing namespace std;\n#include\n#include\n#include\nvector intersperse(vector numbers, int delimeter){ \n", "example_test": "#undef NDEBUG\n#include\nbool issame(vector a,vectorb){\n if (a.size()!=b.size()) return false;\n for (int i=0;i>> parse_nested_parens(\"(()()) ((())) () ((())()())\")\n{2, 3, 1, 3}\n*/\n#include\n#include\n#include\nusing namespace std;\nvector parse_nested_parens(string paren_string){\n", "canonical_solution": " vector all_levels;\n string current_paren;\n int level=0,max_level=0;\n char chr;\n int i;\n for (i=0;imax_level) max_level=level;\n current_paren+=chr;\n }\n if (chr==')')\n {\n level-=1;\n current_paren+=chr;\n if (level==0){\n all_levels.push_back(max_level);\n current_paren=\"\";\n max_level=0;\n }\n }\n }\n return all_levels;\n}\n", "test": "#undef NDEBUG\n#include\nbool issame(vector a,vectorb){\n if (a.size()!=b.size()) return false;\n for (int i=0;i\n#include\n#include\nusing namespace std;\n#include\n#include\n#include\nvector parse_nested_parens(string paren_string){\n", "example_test": "#undef NDEBUG\n#include\nbool issame(vector a,vectorb){\n if (a.size()!=b.size()) return false;\n for (int i=0;i>> filter_by_substring({}, \"a\")\n{}\n>>> filter_by_substring({\"abc\", \"bacd\", \"cde\", \"vector\"}, \"a\")\n{\"abc\", \"bacd\", \"vector\"}\n*/\n#include\n#include\n#include\nusing namespace std;\nvector filter_by_substring(vector strings, string substring){\n", "canonical_solution": " vector out;\n for (int 
i=0;i\nbool issame(vector a,vectorb){\n if (a.size()!=b.size()) return false;\n for (int i=0;i\n#include\n#include\nusing namespace std;\n#include\n#include\n#include\nvector filter_by_substring(vector strings, string substring){\n", "example_test": "#undef NDEBUG\n#include\nbool issame(vector a,vectorb){\n if (a.size()!=b.size()) return false;\n for (int i=0;i>> sum_product({})\n(0, 1)\n>>> sum_product({1, 2, 3, 4})\n(10, 24)\n*/\n#include\n#include\nusing namespace std;\nvector sum_product(vector numbers){\n", "canonical_solution": " int sum=0,product=1;\n for (int i=0;i\nbool issame(vector a,vectorb){\n if (a.size()!=b.size()) return false;\n for (int i=0;i\n#include\nusing namespace std;\n#include\n#include\n#include\nvector sum_product(vector numbers){\n", "example_test": "#undef NDEBUG\n#include\nbool issame(vector a,vectorb){\n if (a.size()!=b.size()) return false;\n for (int i=0;i>> rolling_max({1, 2, 3, 2, 3, 4, 2})\n{1, 2, 3, 3, 3, 4, 4}\n*/\n#include\n#include\nusing namespace std;\nvector rolling_max(vector numbers){\n", "canonical_solution": " vector out;\n int max=0;\n for (int i=0;imax) max=numbers[i];\n out.push_back(max);\n }\n return out;\n}\n", "test": "#undef NDEBUG\n#include\nbool issame(vector a,vectorb){\n if (a.size()!=b.size()) return false;\n for (int i=0;i\n#include\nusing namespace std;\n#include\n#include\n#include\nvector rolling_max(vector numbers){\n", "example_test": "#undef NDEBUG\n#include\nbool issame(vector a,vectorb){\n if (a.size()!=b.size()) return false;\n for (int i=0;i\n#include\nusing namespace std;\nbool is_palindrome(string str){\n //Test if given string is a palindrome \n string s(str.rbegin(),str.rend());\n return s==str;\n}\nstring make_palindrome(string str){\n /*\n Find the shortest palindrome that begins with a supplied string. \n Algorithm idea is simple: - Find the longest postfix of supplied string that is a palindrome. 
\n - Append to the end of the string reverse of a string prefix that comes before the palindromic suffix.\n >>> make_palindrome(\"\") \n \"\" \n >>> make_palindrome(\"cat\") \n \"catac\" \n >>> make_palindrome(\"cata\") \n \"catac\" \n */\n", "canonical_solution": " int i;\n for (i=0;i\nint main(){\n assert (make_palindrome(\"\") == \"\");\n assert (make_palindrome(\"x\") == \"x\");\n assert (make_palindrome(\"xyz\") == \"xyzyx\");\n assert (make_palindrome(\"xyx\") == \"xyx\") ;\n assert (make_palindrome(\"jerry\") == \"jerryrrej\");\n}\n\n\n", "declaration": "#include\n#include\nusing namespace std;\n#include\n#include\n#include\nbool is_palindrome(string str){\n string s(str.rbegin(),str.rend());\n return s==str;\n}\nstring make_palindrome(string str){\n", "example_test": "#undef NDEBUG\n#include\nint main(){\n assert (make_palindrome(\"\") == \"\");\n assert (make_palindrome(\"cat\") == \"catac\");\n assert (make_palindrome(\"cata\") == \"catac\");\n}\n"}
+{"task_id": "CPP/11", "prompt": "/*\nInput are two strings a and b consisting only of 1s and 0s.\nPerform binary XOR on these inputs and return result also as a string.\n>>> string_xor(\"010\", \"110\")\n\"100\"\n*/\n#include\n#include\nusing namespace std;\nstring string_xor(string a,string b){\n", "canonical_solution": " string output=\"\";\n for (int i=0;(i=a.length()) \n {\n output+=b[i];\n }\n else output+=a[i];\n }\n }\n return output;\n}\n", "test": "#undef NDEBUG\n#include\nint main(){\n assert (string_xor(\"111000\", \"101010\") == \"010010\");\n assert (string_xor(\"1\", \"1\") == \"0\");\n assert (string_xor(\"0101\", \"0000\") == \"0101\");\n\n}\n", "declaration": "#include\n#include\nusing namespace std;\n#include\n#include\n#include\nstring string_xor(string a,string b){\n", "example_test": "#undef NDEBUG\n#include\nint main(){\n assert (string_xor(\"010\", \"110\") == \"100\");\n}\n"}
+{"task_id": "CPP/12", "prompt": "/*\nOut of vector of strings, return the longest one. Return the first one in case of multiple\nstrings of the same length. Return None in case the input vector is empty.\n>>> longest({})\n\n>>> longest({\"a\", \"b\", \"c\"})\n\"a\"\n>>> longest({\"a\", \"bb\", \"ccc\"})\n\"ccc\"\n*/\n#include\n#include\n#include\nusing namespace std;\nstring longest(vector strings){\n", "canonical_solution": " string out;\n for (int i=0;iout.length()) out=strings[i];\n }\n return out;\n}\n", "test": "#undef NDEBUG\n#include\nint main(){\n assert (longest({}) == \"\");\n assert (longest({\"x\", \"y\", \"z\"}) == \"x\");\n assert (longest({\"x\", \"yyy\", \"zzzz\", \"www\", \"kkkk\", \"abc\"}) == \"zzzz\");\n}\n", "declaration": "#include\n#include\n#include\nusing namespace std;\n#include\n#include\n#include\nstring longest(vector strings){\n", "example_test": "#undef NDEBUG\n#include\nint main(){\n assert (longest({}) == \"\");\n assert (longest({\"a\", \"b\", \"c\"}) == \"a\");\n assert (longest({\"a\", \"bb\", \"ccc\"}) == \"ccc\");\n}\n"}
+{"task_id": "CPP/13", "prompt": "/*\nReturn a greatest common divisor of two integers a and b\n>>> greatest_common_divisor(3, 5)\n1\n>>> greatest_common_divisor(25, 15)\n5\n*/\n#include\nusing namespace std;\nint greatest_common_divisor(int a, int b){\n", "canonical_solution": " int out,m;\n while (true){\n if (a\nint main(){\n assert (greatest_common_divisor(3, 7) == 1);\n assert (greatest_common_divisor(10, 15) == 5);\n assert (greatest_common_divisor(49, 14) == 7);\n assert (greatest_common_divisor(144, 60) == 12);\n}\n", "declaration": "#include\nusing namespace std;\n#include\n#include\n#include\nint greatest_common_divisor(int a, int b){\n", "example_test": "#undef NDEBUG\n#include\nint main(){\n assert (greatest_common_divisor(3, 5) == 1);\n assert (greatest_common_divisor(25, 15) == 5);\n}\n"}
+{"task_id": "CPP/14", "prompt": "/*\nReturn vector of all prefixes from shortest to longest of the input string\n>>> all_prefixes(\"abc\")\n{\"a\", \"ab\", \"abc\"}\n*/\n#include\n#include\n#include\nusing namespace std;\nvector all_prefixes(string str){\n", "canonical_solution": " vector out;\n string current=\"\";\n for (int i=0;i\nbool issame(vector a,vectorb){\n if (a.size()!=b.size()) return false;\n for (int i=0;i\n#include\n#include\nusing namespace std;\n#include\n#include\n#include\nvector all_prefixes(string str){\n", "example_test": "#undef NDEBUG\n#include\nbool issame(vector a,vectorb){\n if (a.size()!=b.size()) return false;\n for (int i=0;i>> string_sequence(0)\n\"0\"\n>>> string_sequence(5)\n\"0 1 2 3 4 5\"\n*/\n#include\n#include\nusing namespace std;\nstring string_sequence(int n){\n", "canonical_solution": " string out=\"0\";\n for (int i=1;i<=n;i++)\n out=out+\" \"+to_string(i);\n return out;\n}\n", "test": "#undef NDEBUG\n#include\nint main(){\n assert (string_sequence(0) == \"0\");\n assert (string_sequence(3) == \"0 1 2 3\");\n assert (string_sequence(10) == \"0 1 2 3 4 5 6 7 8 9 10\");\n}\n", "declaration": "#include\n#include\n#include\nusing namespace std;\n#include\n#include\nstring string_sequence(int n){\n", "example_test": "#undef NDEBUG\n#include\nint main(){\n assert (string_sequence(0) == \"0\");\n assert (string_sequence(5) == \"0 1 2 3 4 5\");\n}\n"}
+{"task_id": "CPP/16", "prompt": "/*\nGiven a string, find out how many distinct characters (regardless of case) does it consist of\n>>> count_distinct_characters(\"xyzXYZ\")\n3\n>>> count_distinct_characters(\"Jerry\")\n4\n*/\n#include\n#include\n#include\n#include\nusing namespace std;\nint count_distinct_characters(string str){ \n", "canonical_solution": " vector distinct={};\n transform(str.begin(),str.end(),str.begin(),::tolower);\n for (int i=0;i\nint main(){\n assert (count_distinct_characters(\"\") == 0);\n assert (count_distinct_characters(\"abcde\") == 5);\n assert (count_distinct_characters(\"abcdecadeCADE\") == 5);\n assert (count_distinct_characters(\"aaaaAAAAaaaa\") == 1);\n assert (count_distinct_characters(\"Jerry jERRY JeRRRY\") == 5);\n}\n", "declaration": "#include\n#include\n#include\n#include\n#include\nusing namespace std;\n#include\nint count_distinct_characters(string str){ \n", "example_test": "#undef NDEBUG\n#include\nint main(){\n assert (count_distinct_characters(\"xyzXYZ\") == 3);\n assert (count_distinct_characters(\"Jerry\") == 4);\n}\n"}
+{"task_id": "CPP/17", "prompt": "/*\nInput to this function is a string representing musical notes in a special ASCII format.\nYour task is to parse this string and return vector of integers corresponding to how many beats does each\nnot last.\n\nHere is a legend:\n\"o\" - whole note, lasts four beats\n\"o|\" - half note, lasts two beats\n\".|\" - quater note, lasts one beat\n\n>>> parse_music(\"o o| .| o| o| .| .| .| .| o o\")\n{4, 2, 1, 2, 2, 1, 1, 1, 1, 4, 4}\n*/\n#include\n#include\n#include\nusing namespace std;\nvector parse_music(string music_string){ \n", "canonical_solution": " string current=\"\";\n vector out={};\n if (music_string.length()>0)\n music_string=music_string+' ';\n for (int i=0;i\nbool issame(vector a,vectorb){\n if (a.size()!=b.size()) return false;\n for (int i=0;i\n#include\n#include\n#include