"""Extract energy-label fields from images with Gemini on Vertex AI.

Each image found in a folder is sent to the model together with a fixed
extraction prompt. The parsed JSON answers are collected into a pandas
DataFrame and written to a CSV file.
"""

import base64
import io  # noqa: F401  (unused here; kept because the original file imports it)
import json
import mimetypes
import os
from functools import lru_cache

import pandas as pd
import vertexai
from PIL import Image  # noqa: F401  (unused here; kept because the original file imports it)
from tqdm import tqdm
from vertexai.generative_models import GenerativeModel, Part, SafetySetting

PROJECT_ID = "lt-dev-prj-3d3b5f2a"
LOCATION = "asia-south1"
MODEL_NAME = "gemini-1.5-pro-002"

# File extensions (compared case-insensitively) that process_files() picks up.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".gif")

# MIME type used when the file name does not map to an image type.
DEFAULT_MIME_TYPE = "image/jpeg"

GENERATION_CONFIG = {
    "max_output_tokens": 8192,
    "temperature": 0.3,
    "top_p": 0.95,
    "response_mime_type": "application/json",
}

# The prompt wording is kept verbatim; the literal is only split across
# source lines for readability (adjacent literals are concatenated).
# NOTE(review): the field list reaches the model as one long line; putting
# one field per line may read more clearly -- confirm before changing, since
# any edit to this text can change the model's output.
PROMPT = (
    "Extract the following information from the image in JSON format. "
    "STAR RATING --> INTEGER "
    "ISEER --> FLOAT "
    "EER (W/W) --> FLOAT "
    "Label Period Start Date --> DATE "
    "Label Period End Date --> DATE "
    "ApplianceType --> STRING "
    "Brand --> STRING "
    "Model --> STRING "
    "Year --> DATE (YYYY) "
    "Cooling Capacity (100%) (W) --> INTEGER "
    "Cooling Capacity (50%)(W) --> INTEGER "
    "Cooling Capacity (W) --> INTEGER "
    "Electricity Consumption (UNITS) --> FLOAT "
    "Power Consumption (W) --> FLOAT "
    "Compressor Type --> STRING "
    "Variable Speed Compressor --> BOOLEAN(YES/NO) "
    "Heat Pump --> BOOLEAN(YES/NO) "
    "MODEL NO. --> STRING "
    "MODEL NAME. --> STRING "
    "INPUT POWER --> INTEGER "
    "REFRIGERANT --> STRING "
    "ISO --> STRING "
    "OPERATING HOURS PER ANNUM --> INTEGER "
    "Note: STAR RATING is usually highlighted in Red. "
    "SO all the starts with background in RED are considered as STAR RATING. "
    "EG. If only one star has background RED then the STAR RATING WILL BE 1. "
    "If all 5 starts have Background RED then the STAR RATING will be 5. "
    'Provide blank value "" for fields that are not applicable/found in the image. '
    'The value for boolean fields should be "YES" or "NO". '
    "There should strictly no other keys apart from the mentioned above. "
    "Even the naming conventions should not change. "
)


def image_to_base64(image_path):
    """Return the contents of the file at *image_path* as a base64 string.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        The base64 encoding of the file's bytes, decoded to ``str`` as UTF-8.
    """
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def _guess_image_mime_type(image_path):
    """Return an ``image/*`` MIME type guessed from the file name."""
    mime_type, _ = mimetypes.guess_type(os.fspath(image_path))
    if mime_type and mime_type.startswith("image/"):
        return mime_type
    return DEFAULT_MIME_TYPE


@lru_cache(maxsize=1)
def _get_model():
    """Initialise Vertex AI once and return the shared model handle."""
    vertexai.init(project=PROJECT_ID, location=LOCATION)
    return GenerativeModel(MODEL_NAME)


def _build_safety_settings():
    """Return BLOCK_NONE safety settings for the four harm categories used."""
    categories = (
        SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
    )
    return [
        SafetySetting(
            category=category,
            threshold=SafetySetting.HarmBlockThreshold.BLOCK_NONE,
        )
        for category in categories
    ]


def generate(image_path):
    """Send one image plus the extraction prompt to the model.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        The model's response text parsed with ``json.loads``.

    Raises:
        OSError: If the image file cannot be read.
        json.JSONDecodeError: If the response text is not valid JSON.
        Exception: Errors raised by the Vertex AI SDK are not caught here.
    """
    # Read the raw bytes directly; a base64 encode/decode round trip would
    # yield exactly the same bytes.
    with open(image_path, "rb") as image_file:
        image_bytes = image_file.read()

    # Label the payload with the file's own type instead of always claiming
    # JPEG, because process_files() also passes .png and .gif files.
    image = Part.from_data(
        mime_type=_guess_image_mime_type(image_path),
        data=image_bytes,
    )

    responses = _get_model().generate_content(
        [image, PROMPT],
        generation_config=GENERATION_CONFIG,
        safety_settings=_build_safety_settings(),
    )
    return json.loads(responses.text)


def process_files(directory_path):
    """Run :func:`generate` on every image file directly inside a directory.

    Only regular files whose names end with one of ``IMAGE_EXTENSIONS`` are
    processed; sub-directories are not searched. A failure on one file is
    printed and recorded as a row holding ``File Name`` and ``Error``, so a
    single bad image does not stop the run.

    Args:
        directory_path: Directory to scan.

    Returns:
        A DataFrame with one row per image file (empty if none were found).
    """
    rows = []
    print(f"Processing files in directory: {directory_path}")

    # Sorted so the row order does not depend on os.listdir()'s arbitrary order.
    image_files = sorted(
        name
        for name in os.listdir(directory_path)
        if os.path.isfile(os.path.join(directory_path, name))
        and name.lower().endswith(IMAGE_EXTENSIONS)
    )

    if not image_files:
        print("No image files found in the directory.")
        return pd.DataFrame(rows)

    for filename in tqdm(image_files, desc="Processing images"):
        filepath = os.path.join(directory_path, filename)
        try:
            record = generate(filepath)
            # Valid JSON is not necessarily an object; say so explicitly
            # rather than failing on the item assignment below.
            if not isinstance(record, dict):
                raise TypeError(
                    f"expected a JSON object, got {type(record).__name__}"
                )
            record["File Name"] = str(filename)
            rows.append(record)
        except Exception as e:  # per-file boundary: record the failure and go on
            print(f"Error processing file {filename}: {e}")
            rows.append({"File Name": str(filename), "Error": str(e)})

    return pd.DataFrame(rows)


def main():
    """Process the images in ``FOLDER4`` and save the results as CSV."""
    directory_path = 'FOLDER4'  # Make sure this folder exists and contains images

    if not os.path.isdir(directory_path):
        print(
            f"Error: Directory '{directory_path}' not found. "
            "Please create it or provide the correct path."
        )
        return

    extracted_data = process_files(directory_path)
    if extracted_data.empty:
        print("\nNo data was extracted.")
        return

    print("\n--- Extracted Data (First 5 rows) ---")
    print(extracted_data.head())

    output_csv_path = 'extracted_image_data.csv'
    try:
        extracted_data.to_csv(output_csv_path, index=False)
        print(f"\nSuccessfully saved extracted data to {output_csv_path}")
    except Exception as e:
        print(f"\nError saving data to CSV: {e}")


if __name__ == "__main__":
    main()