panik

get_commons.py

#!/usr/bin/env python
# print the urls of all the images in a category of Wikimedia Commons
# example:
# $ python get_commons.py "Category:ScottForesman-raw"

# pipe to wget for download:
# $ python get_commons.py [category] | wget -i - --wait 1

import sys
import json
import urllib2
from urllib import quote

def make_api_query(category, q_continue=""):
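    # fetch one page of category members from the Commons API and print the
    # URL of each file, recursing while the API reports more results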
    if q_continue:
        q_continue = '&gcmcontinue=' + q_continue
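    # generator=categorymembers lists the files in the category,
    # prop=imageinfo&iiprop=url returns each file's direct URL, and
    # gcmlimit=500 is the largest page size the API allows per request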
    url = ('http://commons.wikimedia.org/w/api.php?action=query'
           '&generator=categorymembers&gcmtitle=' + quote(category) + q_continue +
           '&gcmlimit=500&prop=imageinfo&iiprop=url&format=json')
    response = json.loads(urllib2.urlopen(url).read())
    if 'error' in response:
        sys.exit(response['error']['info'])
    for page in response['query']['pages'].values():
        try:
            print page['imageinfo'][0]['url']
        except KeyError:
            pass
    # the API returns at most 500 results per request; to page through
    # larger categories we feed the query-continue value back in:
    if 'query-continue' in response:
        q_continue = quote(response['query-continue']['categorymembers']['gcmcontinue'])
        make_api_query(category, q_continue)

if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.exit("usage: python get_commons.py [category]")
    make_api_query(sys.argv[1])
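
For Python 3, a rough equivalent is sketched below (untested here; the function name list_image_urls is made up, and it assumes the newer API pagination format, where the continuation token is reported under 'continue' rather than 'query-continue'):

#!/usr/bin/env python3
# rough Python 3 sketch of the same query; list_image_urls is a hypothetical name
import sys
import json
from urllib.parse import quote
from urllib.request import urlopen

def list_image_urls(category, gcmcontinue=""):
    url = ('https://commons.wikimedia.org/w/api.php?action=query'
           '&generator=categorymembers&gcmtitle=' + quote(category) +
           '&gcmlimit=500&prop=imageinfo&iiprop=url&format=json')
    if gcmcontinue:
        url += '&gcmcontinue=' + quote(gcmcontinue)
    data = json.loads(urlopen(url).read().decode('utf-8'))
    if 'error' in data:
        sys.exit(data['error']['info'])
    for page in data['query']['pages'].values():
        # pages without imageinfo (e.g. subcategories) are skipped
        if 'imageinfo' in page:
            print(page['imageinfo'][0]['url'])
    # newer MediaWiki APIs report pagination under 'continue' (assumption here)
    if 'continue' in data:
        list_image_urls(category, data['continue']['gcmcontinue'])

if __name__ == "__main__":
    if len(sys.argv) != 2:
        sys.exit("usage: %s [category]" % sys.argv[0])
    list_image_urls(sys.argv[1])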