I've been trying to figure out how to decompress a large gzipped CSV file and then split it into chunks, keeping the header row in each chunk file. I've tried a few things here and there, but I'm not savvy enough to make it work.
Here's the code that I'm using:
def run():
    """Fetch the gzipped CSV into ./original, then split it into zipped chunks.

    The file name is discovered by listing the ``original/`` directory, so
    that directory is expected to contain exactly one file after the fetch.
    """
    # Get filename
    # NOTE(review): "path to file" is a placeholder for the real fetch
    # command; substitute the actual command before running.
    subprocess.call("path to file ./original --recursive".split())
    filename = subprocess.check_output("ls original/ ".split()).strip()
    filename = 'original/' + filename

    # Ungzip then split file. ungzip() hands each chunk to zip_content()
    # itself, so there is nothing left to zip here (the chunk buffer is
    # local to ungzip and not visible in this function).
    ungzip(filename)
def ungzip(filename, chunk_size=250000):
    """Split a gzipped CSV into chunks that each start with the header row.

    The first line of the file is treated as the header. The remaining
    lines are streamed and handed to ``zip_content(content, name)`` in
    groups of at most ``chunk_size`` rows, named ``file1``, ``file2``, ...
    The final, partially filled chunk is emitted as well.

    Args:
        filename: Path of the gzip-compressed CSV file.
        chunk_size: Maximum number of data rows (header excluded) per chunk.

    Returns:
        The list of chunk names passed to ``zip_content``, in order. The
        list is empty when the file contains no data rows.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    chunk_names = []
    with gzip.open(filename) as f:
        # An empty file yields an empty header and the loop below never runs.
        header = f.readline()
        rows = []
        for line in f:
            rows.append(line)
            if len(rows) >= chunk_size:
                chunk_names.append(
                    _zip_chunk(header, rows, len(chunk_names) + 1))
                rows = []
        # Flush the trailing rows that did not fill a whole chunk.
        if rows:
            chunk_names.append(_zip_chunk(header, rows, len(chunk_names) + 1))
    return chunk_names


def _zip_chunk(header, rows, index):
    """Pass the header plus rows to zip_content and return the chunk name."""
    print('-----------')
    name_string = 'file' + str(index)
    print("zipping " + name_string)
    # Join once instead of concatenating per line (avoids quadratic copying).
    # b'' is a plain str on Python 2 and matches the bytes lines that
    # gzip.open() yields in its default binary mode on Python 3.
    zip_content(header + b''.join(rows), name_string)
    return name_string