diff --git a/paddle/api/Matrix.cpp b/paddle/api/Matrix.cpp
index 6a79f83495a56907fec9d3f77b581eddd3a8baeb..e5493a381a6f9e3d135c14649a8e1e438494d363 100644
--- a/paddle/api/Matrix.cpp
+++ b/paddle/api/Matrix.cpp
@@ -52,6 +52,20 @@ Matrix* Matrix::createDense(const std::vector<float>& data, size_t height,
   return m;
 }
 
+/// Build a dense Matrix from a raw float buffer, on CPU or GPU per useGpu.
+Matrix* Matrix::createDenseFromNumpy(float* data, int dim1, int dim2,
+                                     bool copy, bool useGpu)
+                                     throw (UnsupportError) {
+  if (!useGpu) {
+    return Matrix::createCpuDenseFromNumpy(data, dim1, dim2, copy);
+  }
+  /// The GPU factory takes no copy flag, so a non-copying request is refused.
+  if (!copy) {
+    throw UnsupportError("Gpu mode only supports copy=True");
+  }
+  return Matrix::createGpuDenseFromNumpy(data, dim1, dim2);
+}
+
 Matrix* Matrix::createCpuDenseFromNumpy(float* data, int dim1, int dim2,
                                         bool copy) {
   auto m = new Matrix();
diff --git a/paddle/api/Paddle.swig b/paddle/api/Paddle.swig
index a09f24ce1ccf5d026bf9431255c258483854b74b..6a0fbc537d9345f2221ab65d90733f4696be6880 100644
--- a/paddle/api/Paddle.swig
+++ b/paddle/api/Paddle.swig
@@ -4,6 +4,13 @@
 #define SWIG_FILE_WITH_INIT
 #include "api/PaddleAPI.h"   
 %}
+
+%include "exception.i"
+%typemap(throws) UnsupportError %{
+  SWIG_exception(SWIG_RuntimeError, $1.what());  /* report the C++ message */
+  SWIG_fail;  /* NOTE(review): may be unreachable after SWIG_exception - confirm */
+%}
+
 %include "std_vector.i"
 %include "std_pair.i"
 #ifdef SWIGPYTHON
@@ -133,14 +140,21 @@ namespace std {
 %newobject Matrix::createZero;
 %newobject Matrix::createSparse;
 %newobject Matrix::createDense;
+%newobject Matrix::createDenseFromNumpy;
+%newobject Matrix::createCpuDenseFromNumpy;
+%newobject Matrix::createGpuDenseFromNumpy;
 %newobject Vector::createZero;
 %newobject Vector::create;
+%newobject Vector::createVectorFromNumpy;
 %newobject Vector::createCpuVectorFromNumpy;
 %newobject Vector::createGpuVectorFromNumpy;
 %newobject IVector::createZero;
 %newobject IVector::create;
+%newobject IVector::createVectorFromNumpy;
+%newobject IVector::createCpuVectorFromNumpy;
+%newobject IVector::createGpuVectorFromNumpy;
 %newobject Trainer::createByCommandLine;
-%newobject Trainer::getNetworkOutput;
+%newobject Trainer::getForwardOutput;
 %newobject Trainer::getLayerOutput;
 %newobject Arguments::getSlotValue;
 %newobject Arguments::getSlotIds;
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index cf790f2f8ef1dbdce37b279227e95328490c518d..5688ece44d2d58a2184a9f23d4af26c51c319579 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <stddef.h>
 #include <stdint.h>
 #include <string>
+#include <stdexcept>
 #include <vector>
 #include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/TypeDefs.h"
@@ -42,6 +43,12 @@ using namespace paddle::enumeration_wrapper;  // NOLINT
  */
 void initPaddle(int argc, char** argv);
 
+/// Return FLAGS_use_gpu
+bool isUsingGpu();
+
+/// Set FLAGS_use_gpu to the given value
+void setUseGpu(bool useGpu);
+
 /// Return true if this py_paddle is compiled in GPU Version
 bool isGpuVersion();
 
@@ -52,7 +59,11 @@ class IOError {};
 class RangeError {};
 
 /// Not support Error, such as access GPU memory directly, etc.
-class UnsupportError {};
+class UnsupportError : public std::runtime_error {
+public:
+  UnsupportError() : std::runtime_error(" ") {}  // message is a single space
+  UnsupportError(const std::string& message) : std::runtime_error(message) {}
+};
 
 /// This type will map to python's list of float.
 struct FloatArray {
@@ -101,7 +112,8 @@ public:
   /**
    * Create A Matrix with height,width, which is filled by zero.
    */
-  static Matrix* createZero(size_t height, size_t width, bool useGpu = false);
+  static Matrix* createZero(size_t height, size_t width,
+                            bool useGpu = isUsingGpu());
 
   /**
    * Create Sparse Matrix.
@@ -114,7 +126,7 @@ public:
    */
   static Matrix* createSparse(size_t height, size_t width, size_t nnz,
                               bool isNonVal = true, bool trans = false,
-                              bool useGpu = false);
+                              bool useGpu = isUsingGpu());
 
   /**
    * Create Dense Matrix.
@@ -123,7 +135,12 @@ public:
    * @note        the value will be copy into a new matrix.
    */
   static Matrix* createDense(const std::vector<float>& data, size_t height,
-                             size_t width, bool useGpu = false);
+                             size_t width, bool useGpu = isUsingGpu());
+
+  static Matrix* createDenseFromNumpy(float* data, int dim1, int dim2,
+                                      bool copy = true,
+                                      bool useGpu = isUsingGpu())
+                                      throw (UnsupportError);
 
   /**
    *  Create Cpu Dense Matrix from numpy matrix, dtype=float32
@@ -221,15 +238,19 @@ public:
   ~Vector();
 
   /// Create Vector filled with zero.
-  static Vector* createZero(size_t sz, bool useGpu = false);
+  static Vector* createZero(size_t sz, bool useGpu = isUsingGpu());
 
   /**
    * Create Vector from list of float.
    *
    * It will create a new vector, and copy data into it.
    */
-  static Vector* create(const std::vector<float>& data, bool useGpu = false);
+  static Vector* create(const std::vector<float>& data,
+                        bool useGpu = isUsingGpu());
 
+  static Vector* createVectorFromNumpy(float* data, int dim, bool copy = true,
+                                       bool useGpu = isUsingGpu())
+                                       throw (UnsupportError);
   /**
    * Create Cpu Vector from numpy array, which dtype=float32
    *
@@ -259,6 +280,9 @@ public:
   /// Return is GPU vector or not.
   bool isGpu() const;
 
+  /// Return a list of float; the memory is allocated and copied.
+  FloatArray getData() const;
+
   /// __len__ in python
   size_t getSize() const;
 
@@ -279,13 +303,18 @@ class IVector {
 
 public:
   /// Create IVector filled with zero
-  static IVector* createZero(size_t sz, bool useGpu = false);
+  static IVector* createZero(size_t sz, bool useGpu = isUsingGpu());
 
   /**
    * Create IVector from list of int.
    * It will create a new vector, and copy data into it.
    */
-  static IVector* create(const std::vector<int>& data, bool useGpu = false);
+  static IVector* create(const std::vector<int>& data,
+                         bool useGpu = isUsingGpu());
+
+  static IVector* createVectorFromNumpy(int* data, int dim, bool copy = true,
+                                        bool useGpu = isUsingGpu())
+                                        throw (UnsupportError);
 
   /**
    * Create Cpu IVector from numpy array, which dtype=int32
@@ -297,7 +326,7 @@ public:
   /**
    * Create Gpu IVector from numpy array, which dtype=int32
    */
-  static IVector* createGpuVectorFromNumy(int* data, int dim);
+  static IVector* createGpuVectorFromNumpy(int* data, int dim);
 
   /// Cast to numpy array inplace.
   void toNumpyArrayInplace(int** view_data, int* dim1) throw(UnsupportError);
diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp
index 8a6741078f2f19d8c3cb081f129447d6fc5801c9..a8932351a685474a756c3f5b0e5e8c42bbf58237 100644
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@@ -41,6 +41,10 @@ IntWithFloatArray::IntWithFloatArray(const float* v, const int* i, size_t l,
                                      bool f)
     : valBuf(v), idxBuf(i), length(l), needFree(f) {}
 
+bool isUsingGpu() { return FLAGS_use_gpu; }  // read the global use_gpu flag
+
+void setUseGpu(bool useGpu) { FLAGS_use_gpu = useGpu; }  // overwrite the flag
+
 bool isGpuVersion() {
 #ifdef PADDLE_ONLY_CPU
   return false;
diff --git a/paddle/api/Vector.cpp b/paddle/api/Vector.cpp
index 1affc1a5fefb8a1109d2a442db10b7d7641cd9ee..d44cdefc35bd09e04412b52fb9981947caf89588 100644
--- a/paddle/api/Vector.cpp
+++ b/paddle/api/Vector.cpp
@@ -39,6 +39,19 @@ IVector* IVector::create(const std::vector<int>& data, bool useGpu) {
   return v;
 }
 
+/// Build an IVector from a raw int buffer, on CPU or GPU per useGpu.
+IVector* IVector::createVectorFromNumpy(int* data, int dim, bool copy,
+                                        bool useGpu) throw (UnsupportError) {
+  if (!useGpu) {
+    return IVector::createCpuVectorFromNumpy(data, dim, copy);
+  }
+  /// The GPU factory takes no copy flag, so a non-copying request is refused.
+  if (!copy) {
+    throw UnsupportError("Gpu mode only supports copy=True");
+  }
+  return IVector::createGpuVectorFromNumpy(data, dim);
+}
+
 IVector* IVector::createCpuVectorFromNumpy(int* data, int dim, bool copy) {
   auto v = new IVector();
   if (copy) {
@@ -50,7 +63,7 @@ IVector* IVector::createCpuVectorFromNumpy(int* data, int dim, bool copy) {
   return v;
 }
 
-IVector* IVector::createGpuVectorFromNumy(int* data, int dim) {
+IVector* IVector::createGpuVectorFromNumpy(int* data, int dim) {
   auto v = new IVector();
   v->m->vec = paddle::IVector::create(dim, true);
   v->m->vec->copyFrom(data, dim);
@@ -188,12 +201,25 @@ Vector* Vector::createByPaddleVectorPtr(void* ptr) {
   }
 }
 
+/// Build a Vector from a raw float buffer, on CPU or GPU per useGpu.
+Vector* Vector::createVectorFromNumpy(float* data, int dim, bool copy,
+                                      bool useGpu) throw (UnsupportError) {
+  if (!useGpu) {
+    return Vector::createCpuVectorFromNumpy(data, dim, copy);
+  }
+  /// The GPU factory takes no copy flag, so a non-copying request is refused.
+  if (!copy) {
+    throw UnsupportError("Gpu mode only supports copy=True");
+  }
+  return Vector::createGpuVectorFromNumpy(data, dim);
+}
+
 Vector* Vector::createCpuVectorFromNumpy(float* data, int dim, bool copy) {
   CHECK_GT(dim, 0);
   auto retVec = new Vector();
   if (copy) {
     retVec->m->vec = paddle::Vector::create((size_t)dim, false);
-    return retVec;
+    retVec->m->vec->copyFrom(data, dim);
   } else {
     retVec->m->vec = paddle::Vector::create(data, (size_t)dim, false);
   }
@@ -237,6 +263,21 @@ void Vector::copyFromNumpyArray(float* data, int dim) {
   m->vec->copyFrom(data, dim);
 }
 
+/// Export the vector's elements as a FloatArray.
+FloatArray Vector::getData() const {
+  if (!this->isGpu()) {
+    /// CPU vector: pass along the vector's own data pointer and length.
+    return FloatArray(m->vec->getData(), m->vec->getSize());
+  }
+  /// GPU vector: stage the elements in a freshly allocated host buffer.
+  const size_t count = m->vec->getSize();
+  float* hostBuf = new float[count];
+  hl_memcpy_device2host(hostBuf, m->vec->getData(), count * sizeof(float));
+  FloatArray staged(hostBuf, count);
+  staged.needFree = true;  // heap buffer: mark it as needing a free
+  return staged;
+}
+
 bool Vector::isGpu() const {
   return std::dynamic_pointer_cast<paddle::GpuVector>(m->vec) != nullptr;
 }
diff --git a/paddle/api/test/testMatrix.py b/paddle/api/test/testMatrix.py
index 2160612888b0f7ed6b504e5fc5933dfb3781f167..0432345edd659f13bddb1b99f62622c5ea64a4cb 100644
--- a/paddle/api/test/testMatrix.py
+++ b/paddle/api/test/testMatrix.py
@@ -42,7 +42,7 @@ class TestMatrix(unittest.TestCase):
         self.assertEqual(m.getSparseRowCols(2), [])
 
     def test_sparse_value(self):
-        m = swig_paddle.Matrix.createSparse(3, 3, 6, False)
+        m = swig_paddle.Matrix.createSparse(3, 3, 6, False, False, False)
         self.assertIsNotNone(m)
         m.sparseCopyFrom([0, 2, 3, 3], [0, 1, 2], [7.3, 4.2, 3.2])
 
@@ -66,7 +66,7 @@ class TestMatrix(unittest.TestCase):
         self.assertIsNotNone(m)
         self.assertTrue(abs(m.get(1, 1) - 0.5) < 1e-5)
 
-    def test_numpy(self):
+    def test_numpyCpu(self):
         numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32")
         m = swig_paddle.Matrix.createCpuDenseFromNumpy(numpy_mat)
         self.assertEqual((int(m.getHeight()), int(m.getWidth())),
@@ -100,8 +100,25 @@ class TestMatrix(unittest.TestCase):
 
             for a, e in zip(gpu_m.getData(), [1.0, 3.23, 3.0, 4.0, 5.0, 6.0]):
                 self.assertAlmostEqual(a, e)
+
+    def test_numpy(self):
+        numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32")
+        m = swig_paddle.Matrix.createDenseFromNumpy(numpy_mat)
+        self.assertEqual((int(m.getHeight()), int(m.getWidth())), numpy_mat.shape)
+        self.assertEqual(m.isGpu(), swig_paddle.isUsingGpu())
+        for a, e in zip(m.getData(), [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]):
+            self.assertAlmostEqual(a, e)
 
 
 if __name__ == "__main__":
     swig_paddle.initPaddle("--use_gpu=0")
-    unittest.main()
+    # First pass: run the suite with the default device left on the CPU.
+    suite = unittest.TestLoader().loadTestsFromTestCase(TestMatrix)
+    cpu_ok = unittest.TextTestRunner().run(suite).wasSuccessful()
+    # Second pass (GPU builds only): flip the default device and rerun.
+    gpu_ok = True
+    if swig_paddle.isGpuVersion():
+        swig_paddle.setUseGpu(True)
+        gpu_ok = unittest.main(exit=False).result.wasSuccessful()
+    # Exit non-zero if either pass failed so CPU-only builds report failures.
+    raise SystemExit(0 if cpu_ok and gpu_ok else 1)
diff --git a/paddle/api/test/testVector.py b/paddle/api/test/testVector.py
index 5226df79eea3bedbf2b5b6f5fa684cc99a194f7c..48aaa1d73da9e6c207ad5fa2be14a531267bd901 100644
--- a/paddle/api/test/testVector.py
+++ b/paddle/api/test/testVector.py
@@ -20,20 +20,28 @@ import unittest
 
 class TestIVector(unittest.TestCase):
     def test_createZero(self):
-        m = swig_paddle.IVector.createZero(10)
+        m = swig_paddle.IVector.createZero(10, False)
         self.assertIsNotNone(m)
         for i in xrange(10):
             self.assertEqual(m[i], 0)
             m[i] = i
             self.assertEqual(m[i], i)
+        
+        m = swig_paddle.IVector.createZero(10)
+        self.assertEqual(m.isGpu(), swig_paddle.isUsingGpu())
+        self.assertEqual(m.getData(), [0]*10)
 
     def test_create(self):
-        m = swig_paddle.IVector.create(range(10))
+        m = swig_paddle.IVector.create(range(10), False)
         self.assertIsNotNone(m)
         for i in xrange(10):
             self.assertEqual(m[i], i)
+        
+        m = swig_paddle.IVector.create(range(10))
+        self.assertEqual(m.isGpu(), swig_paddle.isUsingGpu())
+        self.assertEqual(m.getData(), range(10))
 
-    def test_numpy(self):
+    def test_cpu_numpy(self):
         vec = np.array([1, 3, 4, 65, 78, 1, 4], dtype="int32")
         iv = swig_paddle.IVector.createCpuVectorFromNumpy(vec)
         self.assertEqual(vec.shape[0], int(iv.__len__()))
@@ -61,25 +69,43 @@ class TestIVector(unittest.TestCase):
             expect_vec = range(0, 10)
             expect_vec[4] = 7
             self.assertEqual(vec.getData(), expect_vec)
+    
+    def test_numpy(self):
+        src = np.array([1, 3, 4, 65, 78, 1, 4], dtype="int32")
+        ivec = swig_paddle.IVector.createVectorFromNumpy(src)  # default device
+        self.assertEqual(swig_paddle.isUsingGpu(), ivec.isGpu())
+        self.assertEqual(list(src), ivec.getData())
 
 
 class TestVector(unittest.TestCase):
     def testCreateZero(self):
-        v = swig_paddle.Vector.createZero(10)
+        v = swig_paddle.Vector.createZero(10, False)
         self.assertIsNotNone(v)
         for i in xrange(len(v)):
             self.assertTrue(util.doubleEqual(v[i], 0))
             v[i] = i
             self.assertTrue(util.doubleEqual(v[i], i))
+        
+        v = swig_paddle.Vector.createZero(10)
+        self.assertEqual(v.isGpu(), swig_paddle.isUsingGpu())
+        self.assertEqual(v.getData(), [0]*10)
 
     def testCreate(self):
-        v = swig_paddle.Vector.create([x / 100.0 for x in xrange(100)])
+        v = swig_paddle.Vector.create([x / 100.0 for x in xrange(100)], False)
         self.assertIsNotNone(v)
         for i in xrange(len(v)):
             self.assertTrue(util.doubleEqual(v[i], i / 100.0))
         self.assertEqual(100, len(v))
+        
+        v = swig_paddle.Vector.create([x / 100.0 for x in xrange(100)])
+        self.assertEqual(v.isGpu(), swig_paddle.isUsingGpu())
+        self.assertEqual(100, len(v))
+        vdata = v.getData()
+        for i in xrange(len(v)):
+            self.assertTrue(util.doubleEqual(vdata[i], i / 100.0))
+        
 
-    def testNumpy(self):
+    def testCpuNumpy(self):
         numpy_arr = np.array([1.2, 2.3, 3.4, 4.5], dtype="float32")
         vec = swig_paddle.Vector.createCpuVectorFromNumpy(numpy_arr)
         assert isinstance(vec, swig_paddle.Vector)
@@ -102,9 +128,18 @@ class TestVector(unittest.TestCase):
 
         for i in xrange(1, len(numpy_3)):
             util.doubleEqual(numpy_3[i], vec[i])
+    
+    def testNumpy(self):
+        # Omitting useGpu must place the vector on the global default device.
+        src = np.array([1.2, 2.3, 3.4, 4.5], dtype="float32")
+        vec = swig_paddle.Vector.createVectorFromNumpy(src)
+        self.assertEqual(swig_paddle.isUsingGpu(), vec.isGpu())
+        for want, got in zip(src, vec.getData()):
+            self.assertTrue(util.doubleEqual(want, got))
+        
 
     def testCopyFromNumpy(self):
-        vec = swig_paddle.Vector.createZero(1)
+        vec = swig_paddle.Vector.createZero(1, False)
         arr = np.array([1.3, 3.2, 2.4], dtype="float32")
         vec.copyFromNumpyArray(arr)
         for i in xrange(len(vec)):
@@ -112,6 +147,18 @@ class TestVector(unittest.TestCase):
 
 
 if __name__ == '__main__':
-    swig_paddle.initPaddle("--use_gpu=1"
-                           if swig_paddle.isGpuVersion() else "--use_gpu=0")
-    unittest.main()
+    swig_paddle.initPaddle("--use_gpu=0")
+    # First pass: CPU as the default device, covering both test cases here.
+    loader = unittest.TestLoader()
+    suite = unittest.TestSuite([
+        loader.loadTestsFromTestCase(TestIVector),
+        loader.loadTestsFromTestCase(TestVector),
+    ])
+    cpu_ok = unittest.TextTestRunner().run(suite).wasSuccessful()
+    # Second pass (GPU builds only): flip the default device and rerun.
+    gpu_ok = True
+    if swig_paddle.isGpuVersion():
+        swig_paddle.setUseGpu(True)
+        gpu_ok = unittest.main(exit=False).result.wasSuccessful()
+    # Exit non-zero if either pass failed so CPU-only builds report failures.
+    raise SystemExit(0 if cpu_ok and gpu_ok else 1)