# coding: utf-8
"""
Preprocess a raw dataset json into a COCO-format reference json for use with
the coco-caption evaluation code.

Input: a json file with an 'images' list (Karpathy split format), where each
element looks like
{'split': u'val',
 'sentences': [{'tokens': [u'a', u'man', u'with', u'a', u'red', u'helmet',
                           u'on', u'a', u'small', u'moped', u'on', u'a',
                           u'dirt', u'road']}, ...],
 'cocoid': 391895}
Only the 'split', 'sentences' (with 'tokens') and 'cocoid' fields are used;
'imgid' serves as a fallback id when 'cocoid' is absent.

This script reads the json, skips every image assigned to the 'train' split,
joins the tokens of each remaining sentence back into a caption string, and
collects the results.

Output: a single json file in the format expected by the COCO caption
evaluation API, i.e. a dict with:
- 'info', 'licenses' and 'type' headers copied from the 2014 MS COCO captions file
- an 'images' field: a list of {'id': image_id} entries
- an 'annotations' field: a list of {'image_id': ..., 'caption': ..., 'id': ...} entries
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import json
import argparse
import sys      # only used by the commented-out id-remapping block in main()
import hashlib  # only used by the commented-out id-remapping block in main()


def main(params):

    # load the Karpathy-format dataset json; only the 'images' list is needed
    with open(params['input_json'][0], 'r') as f:
        imgs = json.load(f)['images']
    # tmp = []
    # for k in imgs.keys():
    #     for img in imgs[k]:
    #         img['filename'] = img['image_id']  # k+'/'+img['image_id']
    #         img['image_id'] = int(
    #             int(hashlib.sha256(img['image_id']).hexdigest(), 16) % sys.maxint)
    #         tmp.append(img)
    # imgs = tmp
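    # (the commented-out block above is an alternative input path for datasets
    # whose image ids are strings: it hashes each id into an int; note that
    # sys.maxint is Python 2 only, sys.maxsize is the Python 3 equivalent)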

    # create output json file: start from the header (info/licenses/type) of
    # the 2014 MS COCO captions annotations
    out = {
        u'info': {u'description': u'This is stable 1.0 version of the 2014 MS COCO dataset.',
                  u'url': u'http://mscoco.org',
                  u'version': u'1.0',
                  u'year': 2014,
                  u'contributor': u'Microsoft COCO group',
                  u'date_created': u'2015-01-27 09:11:52.357475'},
        u'licenses': [
            {u'url': u'http://creativecommons.org/licenses/by-nc-sa/2.0/', u'id': 1,
             u'name': u'Attribution-NonCommercial-ShareAlike License'},
            {u'url': u'http://creativecommons.org/licenses/by-nc/2.0/', u'id': 2,
             u'name': u'Attribution-NonCommercial License'},
            {u'url': u'http://creativecommons.org/licenses/by-nc-nd/2.0/', u'id': 3,
             u'name': u'Attribution-NonCommercial-NoDerivs License'},
            {u'url': u'http://creativecommons.org/licenses/by/2.0/', u'id': 4,
             u'name': u'Attribution License'},
            {u'url': u'http://creativecommons.org/licenses/by-sa/2.0/', u'id': 5,
             u'name': u'Attribution-ShareAlike License'},
            {u'url': u'http://creativecommons.org/licenses/by-nd/2.0/', u'id': 6,
             u'name': u'Attribution-NoDerivs License'},
            {u'url': u'http://flickr.com/commons/usage/', u'id': 7,
             u'name': u'No known copyright restrictions'},
            {u'url': u'http://www.usa.gov/copyright.shtml', u'id': 8,
             u'name': u'United States Government Work'}],
        u'type': u'captions'}
    out.update({'images': [], 'annotations': []})

    cnt = 0
    for img in imgs:
        # reference captions are only needed for non-train images
        if img['split'] == 'train':
            continue
        out['images'].append(
            {u'id': img.get('cocoid', img['imgid'])})
        for s in img['sentences']:
            # skip sentences with no tokens
            if len(s['tokens']) == 0:
                continue
            # re-join the pre-tokenized caption into a plain string
            s = ' '.join(s['tokens'])
            out['annotations'].append(
                {'image_id': out['images'][-1]['id'], 'caption': s, 'id': cnt})
            cnt += 1

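    # For the docstring's example image, one resulting annotation entry would
    # look roughly like:
    #   {'image_id': 391895, 'caption': u'a man with a red helmet on a small moped on a dirt road', 'id': 0}
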
    with open(params['output_json'], 'w') as f:
        json.dump(out, f)
    print('wrote', params['output_json'])


if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    # input json (nargs='+' accepts several paths, but only the first is read)
    parser.add_argument('--input_json', nargs='+', required=True,
                        help='input dataset json file (Karpathy split format)')
    parser.add_argument('--output_json', default='data.json',
                        help='output json file with COCO-format reference captions')

    args = parser.parse_args()
    params = vars(args)  # convert to ordinary dict
    print('parsed input parameters:')
    print(json.dumps(params, indent=2))
    main(params)
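
# The output file can be loaded directly by the COCO caption evaluation
# tools; a minimal sketch, assuming pycocotools is installed and
# 'captions_val.json' was produced by this script:
#
#   from pycocotools.coco import COCO
#   coco = COCO('captions_val.json')  # indexes the 'images'/'annotations' fields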