Python built-in sum function vs. for loop performance

However if the loop is just adding 1 each iteration starting from 0 you could use the fast trick addition. The sum output should be 499999500000 for range(1000000)

import timeit

def sum1():
    s = 0
    for i in range(1000000):
        s += i
    #print s    
    return s

def sum2():

    return sum(range(1000000))

def sum3():
    s = range(1000000)
    s = ((s[1]+s[-1])/2) * (len(s)-1)
    #print(s)
    return s

print 'For Loop Sum:', timeit.timeit(sum1, number=10)
print 'Built-in Sum:', timeit.timeit(sum2, number=10)
print 'Fast Sum:', timeit.timeit(sum3, number=10)

#prints
#For Loop Sum: 1.8420711
#Built-in Sum: 1.1081646
#Fast Sum: 0.3191561

You can see the source code in Python/bltinmodule.c. It has special cases for ints and floats, but since the sum overflows to longs pretty quickly, that probably doesn't have a major performance impact here. The general-case logic is pretty similar to what you'd write in Python, just in C. The speedup is most likely due to the fact that it doesn't have to go through all the bytecode interpreting and error handling overhead:

static PyObject*
builtin_sum(PyObject *self, PyObject *args)
{
    PyObject *seq;
    PyObject *result = NULL;
    PyObject *temp, *item, *iter;

    if (!PyArg_UnpackTuple(args, "sum", 1, 2, &seq, &result))
        return NULL;

    iter = PyObject_GetIter(seq);
    if (iter == NULL)
        return NULL;

    if (result == NULL) {
        result = PyInt_FromLong(0);
        if (result == NULL) {
            Py_DECREF(iter);
            return NULL;
        }
    } else {
        /* reject string values for 'start' parameter */
        if (PyObject_TypeCheck(result, &PyBaseString_Type)) {
            PyErr_SetString(PyExc_TypeError,
                "sum() can't sum strings [use ''.join(seq) instead]");
            Py_DECREF(iter);
            return NULL;
        }
        Py_INCREF(result);
    }

#ifndef SLOW_SUM
    /* Fast addition by keeping temporary sums in C instead of new Python objects.
       Assumes all inputs are the same type.  If the assumption fails, default
       to the more general routine.
    */
    if (PyInt_CheckExact(result)) {
        long i_result = PyInt_AS_LONG(result);
        Py_DECREF(result);
        result = NULL;
        while(result == NULL) {
            item = PyIter_Next(iter);
            if (item == NULL) {
                Py_DECREF(iter);
                if (PyErr_Occurred())
                    return NULL;
                return PyInt_FromLong(i_result);
            }
            if (PyInt_CheckExact(item)) {
                long b = PyInt_AS_LONG(item);
                long x = i_result + b;
                if ((x^i_result) >= 0 || (x^b) >= 0) {
                    i_result = x;
                    Py_DECREF(item);
                    continue;
                }
            }
            /* Either overflowed or is not an int. Restore real objects and process normally */
            result = PyInt_FromLong(i_result);
            temp = PyNumber_Add(result, item);
            Py_DECREF(result);
            Py_DECREF(item);
            result = temp;
            if (result == NULL) {
                Py_DECREF(iter);
                return NULL;
            }
        }
    }

    if (PyFloat_CheckExact(result)) {
        double f_result = PyFloat_AS_DOUBLE(result);
        Py_DECREF(result);
        result = NULL;
        while(result == NULL) {
            item = PyIter_Next(iter);
            if (item == NULL) {
                Py_DECREF(iter);
                if (PyErr_Occurred())
                    return NULL;
                return PyFloat_FromDouble(f_result);
            }
            if (PyFloat_CheckExact(item)) {
                PyFPE_START_PROTECT("add", Py_DECREF(item); Py_DECREF(iter); return 0)
                f_result += PyFloat_AS_DOUBLE(item);
                PyFPE_END_PROTECT(f_result)
                Py_DECREF(item);
                continue;
            }
            if (PyInt_CheckExact(item)) {
                PyFPE_START_PROTECT("add", Py_DECREF(item); Py_DECREF(iter); return 0)
                f_result += (double)PyInt_AS_LONG(item);
                PyFPE_END_PROTECT(f_result)
                Py_DECREF(item);
                continue;
            }
            result = PyFloat_FromDouble(f_result);
            temp = PyNumber_Add(result, item);
            Py_DECREF(result);
            Py_DECREF(item);
            result = temp;
            if (result == NULL) {
                Py_DECREF(iter);
                return NULL;
            }
        }
    }
#endif

    for(;;) {
        item = PyIter_Next(iter);
        if (item == NULL) {
            /* error, or end-of-sequence */
            if (PyErr_Occurred()) {
                Py_DECREF(result);
                result = NULL;
            }
            break;
        }
        /* It's tempting to use PyNumber_InPlaceAdd instead of
           PyNumber_Add here, to avoid quadratic running time
           when doing 'sum(list_of_lists, [])'.  However, this
           would produce a change in behaviour: a snippet like

             empty = []
             sum([[x] for x in range(10)], empty)

           would change the value of empty. */
        temp = PyNumber_Add(result, item);
        Py_DECREF(result);
        Py_DECREF(item);
        result = temp;
        if (result == NULL)
            break;
    }
    Py_DECREF(iter);
    return result;
}

As dwanderson suggested, Numpy is one alternative. It is, indeed, if you want to do some maths. See this benchmark:

import numpy as np

r = range(1000000)       # 12.5 ms
s = sum(r)               # 7.9 ms

ar = np.arange(1000000)  # 0.5 ms
as = np.sum(ar)          # 0.6 ms

So both creating the list and summing it is much faster with numpy. This is mostly because the numpy.array is designed for this and is much more efficient than the list.

However, if we have a python list, then numpy is very slow, as its conversion from a list into a numpy.array is sluggish:

r = range(1000000)
ar = np.array(r)         # 102 ms

The speed difference is actually greater than 3 times, but you slow down either version by first creating a huge in-memory list of 1 million integers. Separate that out of the time trials:

>>> import timeit
>>> def sum1(lst):
...     s = 0
...     for i in lst:
...         s += i
...     return s
... 
>>> def sum2(lst):
...     return sum(lst)
... 
>>> values = range(1000000)
>>> timeit.timeit('f(lst)', 'from __main__ import sum1 as f, values as lst', number=100)
3.457869052886963
>>> timeit.timeit('f(lst)', 'from __main__ import sum2 as f, values as lst', number=100)
0.6696369647979736

The speed difference has risen to over 5 times now.

A for loop is executed as interpreted Python bytecode. sum() loops entirely in C code. The speed difference between interpreted bytecode and C code is large.

In addition, the C code makes sure not to create new Python objects if it can keep the sum in C types instead; this works for int and float results.

The Python version, disassembled, does this:

>>> import dis
>>> def sum1():
...     s = 0
...     for i in range(1000000):
...         s += i
...     return s
... 
>>> dis.dis(sum1)
  2           0 LOAD_CONST               1 (0)
              3 STORE_FAST               0 (s)

  3           6 SETUP_LOOP              30 (to 39)
              9 LOAD_GLOBAL              0 (range)
             12 LOAD_CONST               2 (1000000)
             15 CALL_FUNCTION            1
             18 GET_ITER            
        >>   19 FOR_ITER                16 (to 38)
             22 STORE_FAST               1 (i)

  4          25 LOAD_FAST                0 (s)
             28 LOAD_FAST                1 (i)
             31 INPLACE_ADD         
             32 STORE_FAST               0 (s)
             35 JUMP_ABSOLUTE           19
        >>   38 POP_BLOCK           

  5     >>   39 LOAD_FAST                0 (s)
             42 RETURN_VALUE        

Apart from the interpreter loop being slower than C, the INPLACE_ADD will create a new integer object (past 255, CPython caches small int objects as singletons).

You can see the C implementation in the Python mercurial code repository, but it explicitly states in the comments:

/* Fast addition by keeping temporary sums in C instead of new Python objects.
   Assumes all inputs are the same type.  If the assumption fails, default
   to the more general routine.
*/