R
ronald.johnson
I have a csv file containing product information that is 700+ MB in
size. I'm trying to go through and pull out unique product ID's only
as there are a lot of multiples. My problem is that I am appending the
ProductID to an array and then searching through that array each time
to see if I've seen the product ID before. So each search takes longer
and longer. I let the script run for 2 hours before killing it and had
only run through less than 1/10 if the file.
Heres the code:
import string
def checkForProduct(product_id, product_list):
for product in product_list:
if product == product_id:
return 1
return 0
input_file="c:\\input.txt"
output_file="c:\\output.txt"
product_info = []
input_count = 0
input = open(input_file,"r")
output = open(output_file, "w")
for line in input:
break_down = line.split(",")
product_number = break_down[2]
input_count+=1
if input_count == 1:
product_info.append(product_number)
output.write(line)
output_count = 1
if not checkForProduct(product_number,product_info):
product_info.append(product_number)
output.write(line)
output_count+=1
output.close()
input.close()
print input_count
print output_count
size. I'm trying to go through and pull out unique product ID's only
as there are a lot of multiples. My problem is that I am appending the
ProductID to an array and then searching through that array each time
to see if I've seen the product ID before. So each search takes longer
and longer. I let the script run for 2 hours before killing it and had
only run through less than 1/10 if the file.
Heres the code:
import string
def checkForProduct(product_id, product_list):
for product in product_list:
if product == product_id:
return 1
return 0
input_file="c:\\input.txt"
output_file="c:\\output.txt"
product_info = []
input_count = 0
input = open(input_file,"r")
output = open(output_file, "w")
for line in input:
break_down = line.split(",")
product_number = break_down[2]
input_count+=1
if input_count == 1:
product_info.append(product_number)
output.write(line)
output_count = 1
if not checkForProduct(product_number,product_info):
product_info.append(product_number)
output.write(line)
output_count+=1
output.close()
input.close()
print input_count
print output_count