
I have been trying to multi-thread some internet connections in Python, using the multiprocessing module to get around the Global Interpreter Lock. But it seems that the system only gives Python one open connection port, or at least only allows one connection to happen at a time. Here is an example of what I mean.

*Note that this is running on a Linux server.

from multiprocessing import Process, Queue
import urllib
import random

# Generate 10,000 random urls to test and put them in the queue
queue = Queue()
for each in range(10000):
    rand_num = random.randint(1000,10000)
    url = ('http://www.' + str(rand_num) + '.com')
    queue.put(url)

# Main function for checking to see if a generated url is active
def check(q):
    while True:
        try:
            url = q.get(False)
            try:
                request = urllib.urlopen(url)
                del request
                print url + ' is an active url!'
            except:
                print url + ' is not an active url!'
        except:
            if q.empty():
                break

# Then start all the worker processes (50)
for thread in range(50):
    task = Process(target=check, args=(queue,))
    task.start()

So if you run this you will notice that it starts 50 instances of the function but only runs one at a time. You may think that the Global Interpreter Lock is doing this, but it isn't. Try changing the function to a mathematical function instead of a network request and you will see that all fifty processes run simultaneously.
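
For example, the kind of mathematical function I mean is something along these lines (just an illustration, not my exact code); with this, htop should show all 50 processes busy at once:

from multiprocessing import Process

# Toy CPU-bound task: just burn cycles so every process stays runnable
def crunch(n):
    total = 0
    for i in xrange(n):
        total += i * i
    print 'finished crunching %d numbers' % n

# Start 50 worker processes
for each in range(50):
    task = Process(target=crunch, args=(10000000,))
    task.start()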

So will I have to work with sockets? Or is there something I can do that will give Python access to more ports? Or is there something I am not seeing? Let me know what you think! Thanks!

*Edit

So I wrote this script to test things more carefully, this time with the requests library. It seems I had not tested it very well before (I had mainly used urllib and urllib2).

from multiprocessing import Process, Queue
from threading import Thread
from Queue import Queue as Q
import requests
import time

# A main timestamp
main_time = time.time()

# Generate 100 urls to test and put them in the queue
queue = Queue()
for each in range(100):
    url = ('http://www.' + str(each) + '.com')
    queue.put(url)

# Timer queue
time_queue = Queue()

# Main function for checking to see if generated url is active
def check(q, t_q): # args are queue and time_queue
    while True:
        try:
            url = q.get(False)
            # Make a timestamp
            t = time.time()
            try:
                request = requests.head(url, timeout=5)
                t = time.time() - t
                t_q.put(t)
                del request
            except:
                t = time.time() - t
                t_q.put(t)
        except:
            break

# Then start all the worker processes (20)
thread_list = []
for thread in range(20):
    task = Process(target=check, args=(queue, time_queue))
    task.start()
    thread_list.append(task)

# Join all the processes so the main process doesn't quit early
for each in thread_list:
    each.join()
main_time_end = time.time()

# Put the timerQueue into a list to get the average
time_queue_list = []
while True:
    try:
        time_queue_list.append(time_queue.get(False))
    except:
        break

# Results of the time
average_response = sum(time_queue_list) / float(len(time_queue_list))
total_time = main_time_end - main_time
line =  "Multiprocessing: Average response time: %s sec. -- Total time: %s sec." % (average_response, total_time)
print line

# A main timestamp
main_time = time.time()

# Generate 100 urls to test and put them in the queue
queue = Q()
for each in range(100):
    url = ('http://www.' + str(each) + '.com')
    queue.put(url)

# Timer queue
time_queue = Queue()

# Main function for checking to see if generated url is active
def check(q, t_q): # args are queue and time_queue
    while True:
        try:
            url = q.get(False)
            # Make a timestamp
            t = time.time()
            try:
                request = requests.head(url, timeout=5)
                t = time.time() - t
                t_q.put(t)
                del request
            except:
                t = time.time() - t
                t_q.put(t)
        except:
            break

# Then start all the threads (20)
thread_list = []
for thread in range(20):
    task = Thread(target=check, args=(queue, time_queue))
    task.start()
    thread_list.append(task)

# Join all the threads so the main process doesn't quit early
for each in thread_list:
    each.join()
main_time_end = time.time()

# Put the timerQueue into a list to get the average
time_queue_list = []
while True:
    try:
        time_queue_list.append(time_queue.get(False))
    except:
        break

# Results of the time
average_response = sum(time_queue_list) / float(len(time_queue_list))
total_time = main_time_end - main_time
line =  "Standard Threading: Average response time: %s sec. -- Total time: %s sec." % (average_response, total_time)
print line

# Do the same thing all over again but this time do each url at a time
# A main timestamp
main_time = time.time()

# Generate 100 urls and test them
timer_list = []
for each in range(100):
    url = ('http://www.' + str(each) + '.com')
    t = time.time()
    try:
        request = requests.head(url, timeout=5)
        timer_list.append(time.time() - t)
    except:
        timer_list.append(time.time() - t)
main_time_end = time.time()

# Results of the time
average_response = sum(timer_list) / float(len(timer_list))
total_time = main_time_end - main_time
line = "Not using threads: Average response time: %s sec. -- Total time: %s sec." % (average_response, total_time)
print line

As you can see, the requests really are running in parallel. In fact, most of my tests show that the threading module is actually faster than the multiprocessing module (I don't understand why!). Here are some of my results.

Multiprocessing: Average response time: 2.40511314869 sec. -- Total time: 25.6876308918 sec.
Standard Threading: Average response time: 2.2179402256 sec. -- Total time: 24.2941861153 sec.
Not using threads: Average response time: 2.1740363431 sec. -- Total time: 217.404567957 sec.

This was done on my home network; the response time on my server is much faster. I think my question has been answered indirectly, since I was having my problems on a much more complex script. All of the suggestions helped me optimize it very well. Thanks to everyone!

TysonU
  • Have you tried a different python module for doing the HTTP legwork, maybe [requests](http://docs.python-requests.org/en/latest/)? We know `urllib` [isn't thread-safe](http://stackoverflow.com/a/5825531/228489), though I don't think that should affect multiprocessing, but I'd try a different module to find out. – amccormack May 08 '15 at 18:48
  • How can you tell only a single process is running? I think what happens is that a mathematical function is much faster to complete than an HTTP request, and while it may seem like the run is synchronous, it's actually doing many requests but manages to write to standard output clearly because they're slow. – Reut Sharabani May 08 '15 at 18:49
  • @ReutSharabani Well, I had been checking it in htop, but it also only prints one line at a time. If it was actually running multiple processes it would print many out at once. – TysonU May 08 '15 at 18:54
  • This is what I get when I run the script: `reut@sharabani:~/python/ports$ pgrep python | wc -l` Out: `51` – Reut Sharabani May 08 '15 at 18:57
  • @ReutSharabani Yes. That is exactly what I get. To my knowledge that means that 51 python threads have been started, but it does not mean that all 51 are running. If you open htop you will notice that you only have one or two threads running. – TysonU May 08 '15 at 19:04
  • @amccormack I just tried it with the requests library and got the same results. So it's not the urllib module. – TysonU May 08 '15 at 19:06

3 Answers


it starts 50 instances of the function but only runs one at a time

You have misinterpreted the results of htop. Only a few, if any, copies of Python will be runnable at any given instant. Most of them will be blocked waiting for network I/O.

The processes are, in fact, running in parallel.

Try changing the function to a mathematical function instead of a network request and you will see that all fifty processes run simultaneously.

Changing the task to a mathematical function merely illustrates the difference between CPU-bound (e.g. math) and IO-bound (e.g. urlopen) processes. The former is always runnable; the latter is rarely runnable.

it only prints one at a time. If it was actually running multiple processes it would print many out at once.

It prints one at a time because you are writing lines to a terminal. Because the lines are indistinguishable, you wouldn't be able to tell whether they are all written by one thread, or each by a separate thread in turn.
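
If you want to see this for yourself, one possible tweak (a sketch of your own check function, not required) is to tag each printed line with the worker's PID; you will then see lines from many different PIDs interleaved while the requests are in flight:

import os
import urllib

def check(q):
    while True:
        try:
            url = q.get(False)
            try:
                request = urllib.urlopen(url)
                del request
                # os.getpid() identifies which worker process printed the line
                print '[pid %d] %s is an active url!' % (os.getpid(), url)
            except:
                print '[pid %d] %s is not an active url!' % (os.getpid(), url)
        except:
            if q.empty():
                break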

Robᵩ
  • So my question is this: can a Linux machine have more than one IO-bound process at once? I know that every port seems to be limited to one process. Is my system only opening one port for 'urlopen'? Could I not somehow manually have each process open itself on a new port and maybe have more success? – TysonU May 08 '15 at 19:35
  • 1) Yes, of course Linux can have multiple IO-bound processes. In fact, that is precisely what htop is showing you -- of the 50 processes you start, most of them are waiting for IO. 2) "I know that every port seems to be limited to one process." Balderdash. There is no such limitation. 3) "have more success" -- what, precisely, are you trying to accomplish, and what makes you think that you're not already accomplishing it? – Robᵩ May 08 '15 at 19:38
  • If two threads try to print at the same time it usually results in printing multiple statements per line. – TysonU May 08 '15 at 19:39
  • Answering your comment as to what I am trying to accomplish: I am trying to get multiple requests to happen at the same time. For example, each thread could be downloading something simultaneously. It's obviously not, as you pointed out before. – TysonU May 08 '15 at 19:49
  • You have succeeded! You have multiple requests happening at the same time. Each thread is downloading simultaneously. Your program is working precisely as you expect it to. – Robᵩ May 08 '15 at 19:51
  • Well, it's not happening for me. I am only sending one request at a time. The responses come back at random times, but the system is only sending one HTTP request at a time. – TysonU May 08 '15 at 20:09
  • Sorry to keep bothering you, but I tried sending the request asking only for the 'HEAD' as a response. In your first comment to this answer you stated that my 50 processes are waiting for IO. If that is so, then more of the processes should be active at the same time, since the data comes back faster. But it does not change; there is still only one IO request that is actually running. – TysonU May 08 '15 at 20:44

First of all, using multiprocessing to parallelize network I/O is overkill. Using the built-in threading module or a lightweight greenlet library like gevent is a much better option with less overhead. The GIL is released during blocking IO calls, so you don't have to worry about it at all.
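
For instance, a minimal gevent sketch of the same URL check might look roughly like this (assuming gevent is installed; the URL range here is just for illustration):

import gevent
from gevent import monkey
monkey.patch_all()  # make the standard socket module cooperative

import urllib

def check(url):
    try:
        urllib.urlopen(url)
        print url + ' is an active url!'
    except:
        print url + ' is not an active url!'

urls = ['http://www.%d.com' % n for n in range(1000, 1050)]
# One greenlet per url; they all wait on the network concurrently
jobs = [gevent.spawn(check, url) for url in urls]
gevent.joinall(jobs, timeout=30)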

Secondly, if you are monitoring stdout, an easy way to see whether your subprocesses/threads/greenlets are running in parallel is to print something at the very beginning of the function, right after they are spawned. For example, modify your check() function like so:

def check(q):
    print 'Start checking urls!'
    while True:
        ...

If your code is correct, you should see many Start checking urls! lines printed before any of the url + ' is [not] an active url!' lines. It works on my machine, so it looks like your code is correct.

oxymor0n
  • I think the question here is not *"does check run in parallel?"*. It's *"does urllib.urlopen run in parallel?"*. – Reut Sharabani May 08 '15 at 19:14
  • If `check()` runs in parallel, then `urllib.urlopen()` would run in parallel (unless there's something seriously wrong with his file descriptor settings, which I doubt). If you want proof, run `check()` in a sequential manner (i.e., replace the `for thread in range...` block with `check(queue)`) and you'd see that the url checking takes a lot more time. – oxymor0n May 08 '15 at 19:21
  • I think it **does** run in parallel, I'm just saying that your answer ignores the question. He clearly stated that a mathematical calculation does run in parallel for him. – Reut Sharabani May 08 '15 at 19:24
  • @oxymor0n Just like Reut said, I'm not worried about how many normal functions I can start in parallel. My problem is that my system seems to be limiting Python to only one network request at a time. So I can have many `check`s running in parallel, but they all wait on each other to finish the requests because it's only allowing one at a time. – TysonU May 08 '15 at 19:26
  • Reut Sharabani I see your point now. @TysonU have you compared the speed of the parallel and the sequential version? On my machine, your code runs perfectly, so it's not the cause of the problem here. If the parallel and sequential versions run at the same speed, please check the number of your file descriptors. – oxymor0n May 08 '15 at 19:31
  • @oxymor0n The parallel version finishes a few seconds faster. I think it's because it has the request prepared and waiting for a network opening, and the sequential version has to wait for the loop each time to pull the information out of the queue and make the request with it. I'm not too good with file descriptors, but I'll check it out. – TysonU May 08 '15 at 19:55

It appears that your issue is actually with the serial behavior of gethostbyname(3). This is discussed in this SO thread.
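
Before switching libraries, one way to check whether the lookups are what is serializing things is to time them sequentially and from threads (a rough diagnostic sketch; the hostnames are arbitrary). If the threaded run is no faster, the resolver is the bottleneck:

import socket
import time
from threading import Thread

hosts = ['www.%d.com' % n for n in range(1000, 1020)]

def resolve(host):
    try:
        socket.gethostbyname(host)
    except socket.error:
        pass

# Sequential lookups
t = time.time()
for host in hosts:
    resolve(host)
print 'sequential DNS: %.2f sec' % (time.time() - t)

# Threaded lookups -- no speedup here means the lookups are serialized
t = time.time()
threads = [Thread(target=resolve, args=(host,)) for host in hosts]
for th in threads:
    th.start()
for th in threads:
    th.join()
print 'threaded DNS:   %.2f sec' % (time.time() - t)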

Try this code that uses the Twisted asynchronous I/O library:

import random
import sys
from twisted.internet import reactor
from twisted.internet import defer
from twisted.internet.task import cooperate
from twisted.web import client

SIMULTANEOUS_CONNECTIONS = 25
# Generate 10,000 random urls to test and put them in the queue
pages = []
for each in range(10000):
    rand_num = random.randint(1000,10000)
    url = ('http://www.' + str(rand_num) + '.com')
    pages.append(url)

# Main function for checking to see if generated url is active
def check(page):
    def successback(data, page):
        print "{} is an active URL!".format(page)

    def errback(err, page):
        print "{} is not an active URL!; errmsg:{}".format(page, err.value)

    d = client.getPage(page, timeout=3) # timeout in seconds
    d.addCallback(successback, page)
    d.addErrback(errback, page)
    return d

def generate_checks(pages):
    for i in xrange(0, len(pages)):
        page = pages[i]
        #print "Page no. {}".format(i)
        yield check(page)

def work(pages):
    print "started work(): {}".format(len(pages))
    batch_size = len(pages) / SIMULTANEOUS_CONNECTIONS
    for i in xrange(0, len(pages), batch_size):
        task = cooperate(generate_checks(pages[i:i+batch_size]))

print "starting..."
reactor.callWhenRunning(work, pages)
reactor.run()
Jesse Spears