Comments (3)
I can't tell why without seeing the code.
Below is the multi-GPU code:
# data parallelism
import numpy as np
import tensorflow as tf
import time

start_time = time.time()

def make_parallel(fn, num_gpus, **kwargs):
    # Split each input along the batch dimension, one shard per GPU.
    in_splits = {}
    for k, v in kwargs.items():
        in_splits[k] = tf.split(v, num_gpus)

    # Build one replica of the model on each GPU, reusing variables.
    out_split = []
    for i in range(num_gpus):
        with tf.device(tf.DeviceSpec(device_type="GPU", device_index=i)):
            with tf.variable_scope(tf.get_variable_scope(), reuse=i > 0):
                out_split.append(fn(**{k: v[i] for k, v in in_splits.items()}))

    return tf.concat(out_split, axis=0)

def model(x, y):
    w = tf.get_variable("W", shape=[3, 1])
    f = tf.stack([tf.square(x), x, tf.ones_like(x)], 1)
    yhat = tf.squeeze(tf.matmul(f, w), 1)
    loss = tf.square(yhat - y)
    return loss

x = tf.placeholder(tf.float32)
y = tf.placeholder(tf.float32)

#vvvvvvvvvvvvvv!!!parallel!!!vvvvvvvvvvvvvvvvvvvvvv#
loss = make_parallel(model, 4, x=x, y=y)
#^^^^^^^^^^^^^^!!!parallel!!!^^^^^^^^^^^^^^^^^^^^^^#

#vvvvvvvvvvvvvv!!!parallel!!!vvvvvvvvvvvvvvvvvvvvvv#
train_op = tf.train.AdamOptimizer(0.1).minimize(
    tf.reduce_mean(loss),
    colocate_gradients_with_ops=True)
#^^^^^^^^^^^^^^!!!parallel!!!^^^^^^^^^^^^^^^^^^^^^^#

def generate_data():
    x_val = np.random.uniform(-10.0, 10.0, size=1024 * 1024 * 100)
    y_val = 5 * np.square(x_val) + 3
    return x_val, y_val

sess = tf.Session()
sess.run(tf.global_variables_initializer())

for i in range(10):
    print("%d %.4f" % (i, time.time() - start_time))
    x_val, y_val = generate_data()
    _, loss_val = sess.run([train_op, loss], {x: x_val, y: y_val})
    _, loss_val = sess.run([train_op, loss], {x: x_val, y: y_val})

print(sess.run(tf.contrib.framework.get_variables_by_name("W")))
print(loss_val)

end_time = time.time()
print("time: %.4f" % (end_time - start_time))
Below is the single-GPU code:
# before data parallelism
import numpy as np
import tensorflow as tf
import time

start_time = time.time()

def model(x, y):
    w = tf.get_variable("W", shape=[3, 1])
    f = tf.stack([tf.square(x), x, tf.ones_like(x)], 1)
    yhat = tf.squeeze(tf.matmul(f, w), 1)
    loss = tf.square(yhat - y)
    return loss

x = tf.placeholder(tf.float32)
y = tf.placeholder(tf.float32)

loss = model(x, y)
train_op = tf.train.AdamOptimizer(0.1).minimize(
    tf.reduce_mean(loss))

def generate_data():
    x_val = np.random.uniform(-10.0, 10.0, size=1024 * 1024 * 100)
    y_val = 5 * np.square(x_val) + 3
    return x_val, y_val

sess = tf.Session()
sess.run(tf.global_variables_initializer())

for i in range(10):
    print("%d %.4f" % (i, time.time() - start_time))
    x_val, y_val = generate_data()
    _, loss_val = sess.run([train_op, loss], {x: x_val, y: y_val})
    _, loss_val = sess.run([train_op, loss], {x: x_val, y: y_val})

print(sess.run(tf.contrib.framework.get_variables_by_name("W")))
print(loss_val)

end_time = time.time()
print("time: %.4f" % (end_time - start_time))
This is not the correct way to evaluate runtime performance: start_time is taken before graph construction, so each printed number also includes one-time setup and the (expensive) NumPy data generation. I fixed it by timing each training step on its own and reporting the median:
# before data parallelism
import numpy as np
import tensorflow as tf
import time

tf.reset_default_graph()

def make_parallel(fn, num_gpus, **kwargs):
    in_splits = {}
    for k, v in kwargs.items():
        in_splits[k] = tf.split(v, num_gpus)

    out_split = []
    for i in range(num_gpus):
        with tf.device(tf.DeviceSpec(device_type="GPU", device_index=i)):
            with tf.variable_scope(tf.get_variable_scope(), reuse=i > 0):
                out_split.append(fn(**{k: v[i] for k, v in in_splits.items()}))

    return tf.concat(out_split, axis=0)

def model(x, y):
    w = tf.get_variable("W", shape=[3, 1])
    f = tf.stack([tf.square(x), x, tf.ones_like(x)], 1)
    yhat = tf.squeeze(tf.matmul(f, w), 1)
    loss = tf.square(yhat - y)
    return loss

x = tf.placeholder(tf.float32)
y = tf.placeholder(tf.float32)

loss = model(x, y)
# loss = make_parallel(model, 2, x=x, y=y)
train_op = tf.train.AdamOptimizer(0.1).minimize(
    tf.reduce_mean(loss),
    colocate_gradients_with_ops=True)

def generate_data():
    x_val = np.random.uniform(-10.0, 10.0, size=1024 * 1024 * 100)
    y_val = 5 * np.square(x_val) + 3
    return x_val, y_val

sess = tf.Session()
sess.run(tf.global_variables_initializer())

diffs = []
for i in range(10):
    x_val, y_val = generate_data()
    # time only the training step, not the data generation
    start_time = time.time()
    _, loss_val = sess.run([train_op, loss], {x: x_val, y: y_val})
    diff = time.time() - start_time
    diffs.append(diff)
    print("%d %.4f" % (i, diff))
    # second, untimed training step
    _, loss_val = sess.run([train_op, loss], {x: x_val, y: y_val})

print(sess.run(tf.contrib.framework.get_variables_by_name("W")))
print(loss_val)
print("time: %.4f" % np.median(diffs))
This is the result (median step time in seconds):
1 GPU: 0.7662
2 GPUs: 0.6438
Furthermore, this model is too simple; you'd see much larger gains if your model did more computation. The current bottleneck is mostly data transfer between the CPU and the GPUs.
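One way to check that claim (a minimal sketch of my own, not from the thread): generate the synthetic batch in-graph with tf.random_uniform instead of feeding NumPy arrays through feed_dict, so a timed step no longer includes the host-to-device copy of roughly 100M floats. It reuses the model() and make_parallel() definitions from above.

# Hypothetical variant: draw the batch on the fly inside the graph,
# removing the feed_dict host-to-device transfer from the timed step.
tf.reset_default_graph()

# Same batch size as generate_data() above.
x_gen = tf.random_uniform([1024 * 1024 * 100], -10.0, 10.0)
y_gen = 5 * tf.square(x_gen) + 3

loss = make_parallel(model, 2, x=x_gen, y=y_gen)
train_op = tf.train.AdamOptimizer(0.1).minimize(
    tf.reduce_mean(loss),
    colocate_gradients_with_ops=True)

sess = tf.Session()
sess.run(tf.global_variables_initializer())
sess.run(train_op)  # no feed_dict: the batch is generated in-graph each step

If the step time drops sharply under this variant, that supports the transfer-bound explanation; whatever remains is the compute, which is where extra GPUs actually help.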