"""Extract air-conditioner energy-label specifications from images.

Every image in a directory is sent to a Vertex AI Gemini model together with
a fixed extraction prompt.  The JSON replies are collected into a pandas
DataFrame (one row per image) and written to a CSV file.
"""

# Provenance: recovered from the patch "initial commit"
# (9cc5f634eee7c3ea6f528b274c500c29aac01f1a, Aditya Thaker,
# Thu, 5 Jun 2025 18:08:36 +0530), which created ac_specs_extraction.py.

import base64
import functools
import io  # noqa: F401  (unused here; kept from the original import list)
import json
import mimetypes
import os

import pandas as pd
import vertexai
from PIL import Image  # noqa: F401  (unused here; kept from the original import list)
from tqdm import tqdm
from vertexai.generative_models import GenerativeModel, Part, SafetySetting

DEFAULT_PROJECT = "lt-dev-prj-3d3b5f2a"
DEFAULT_LOCATION = "asia-south1"
DEFAULT_MODEL_NAME = "gemini-1.5-pro-002"

# File-name suffixes (compared case-insensitively) treated as images.
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif')

# MIME type used when the file name gives no usable hint; this is the value
# the script originally sent for every image.
FALLBACK_MIME_TYPE = "image/jpeg"

GENERATION_CONFIG = {
    "max_output_tokens": 8192,
    "temperature": 0.3,
    "top_p": 0.95,
    "response_mime_type": "application/json",
}

# NOTE: the wording below is runtime text sent to the model and is kept
# verbatim (typos included); only its line layout was reconstructed.
EXTRACTION_PROMPT = """Extract the following information from the image in JSON format.
    STAR RATING --> INTEGER
    ISEER --> FLOAT
    EER (W/W) --> FLOAT
    Label Period Start Date --> DATE
    Label Period End Date --> DATE
    ApplianceType --> STRING
    Brand --> STRING
    Model --> STRING
    Year --> DATE (YYYY)
    Cooling Capacity (100%) (W) --> INTEGER
    Cooling Capacity (50%)(W) --> INTEGER
    Cooling Capacity (W) --> INTEGER
    Electricity Consumption (UNITS) --> FLOAT
    Power Consumption (W) --> FLOAT
    Compressor Type --> STRING
    Variable Speed Compressor --> BOOLEAN(YES/NO)
    Heat Pump --> BOOLEAN(YES/NO)
    MODEL NO. --> STRING
    MODEL NAME. --> STRING
    INPUT POWER --> INTEGER
    REFRIGERANT --> STRING
    ISO --> STRING
    OPERATING HOURS PER ANNUM --> INTEGER

    Note: STAR RATING is usually highlighted in Red. SO all the starts with background in RED are considered as STAR RATING. EG.
    If only one star has background RED then the STAR RATING WILL BE 1. If all 5 starts have Background RED then the STAR RATING
    will be 5.

    Provide blank value "" for fields that are not applicable/found in the image.
    The value for boolean fields should be "YES" or "NO".

    There should strictly no other keys apart from the mentioned above. Even the naming conventions should not change.
    """


def image_to_base64(image_path):
    """Return the contents of *image_path* as a base64-encoded ASCII string.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        The base64 text of the file's bytes.
    """
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def _guess_image_mime_type(image_path):
    """Return the image MIME type implied by the file name of *image_path*."""
    guessed, _ = mimetypes.guess_type(os.fspath(image_path))
    if guessed and guessed.startswith("image/"):
        return guessed
    return FALLBACK_MIME_TYPE


def _build_safety_settings():
    """Return safety settings that disable blocking for the four harm categories."""
    categories = (
        SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
    )
    return [
        SafetySetting(
            category=category,
            threshold=SafetySetting.HarmBlockThreshold.BLOCK_NONE,
        )
        for category in categories
    ]


@functools.lru_cache(maxsize=8)
def _get_model(project, location, model_name):
    """Initialise Vertex AI once per configuration and return the model."""
    vertexai.init(project=project, location=location)
    return GenerativeModel(model_name)


def generate(
    image_path,
    project=DEFAULT_PROJECT,
    location=DEFAULT_LOCATION,
    model_name=DEFAULT_MODEL_NAME,
):
    """Ask the Gemini model to extract the label fields from one image.

    Args:
        image_path: Path of the image file to analyse.
        project: Google Cloud project used for Vertex AI.
        location: Vertex AI region.
        model_name: Name of the generative model to call.

    Returns:
        The model's reply parsed from JSON, as a dict.

    Raises:
        OSError: If the image file cannot be read.
        ValueError: If the reply is not valid JSON (``json.JSONDecodeError``)
            or is valid JSON but not an object.
    """
    model = _get_model(project, location, model_name)

    # Send the raw bytes directly; a base64 encode/decode round trip would
    # only reproduce the same bytes.
    with open(image_path, "rb") as image_file:
        image_bytes = image_file.read()

    # The MIME type follows the file name so that PNG/GIF inputs are not
    # mislabelled as JPEG.
    image = Part.from_data(
        mime_type=_guess_image_mime_type(image_path),
        data=image_bytes,
    )

    response = model.generate_content(
        [image, EXTRACTION_PROMPT],
        generation_config=dict(GENERATION_CONFIG),
        safety_settings=_build_safety_settings(),
    )

    parsed = json.loads(response.text)
    if not isinstance(parsed, dict):
        # Callers add a "File Name" key, so anything but an object is unusable.
        raise ValueError(
            f"Expected a JSON object from the model, got {type(parsed).__name__}"
        )
    return parsed


def process_files(directory_path):
    """Run :func:`generate` on every image in *directory_path*.

    Files are selected by suffix (see ``IMAGE_EXTENSIONS``) and processed in
    sorted name order so that the output is reproducible.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        A DataFrame with one row per image.  Successful rows hold the
        extracted fields plus ``"File Name"``; failed rows hold
        ``"File Name"`` and ``"Error"``.  Empty when no images are found.
    """
    print(f"Processing files in directory: {directory_path}")
    image_files = sorted(
        name
        for name in os.listdir(directory_path)
        if name.lower().endswith(IMAGE_EXTENSIONS)
        and os.path.isfile(os.path.join(directory_path, name))
    )

    if not image_files:
        print("No image files found in the directory.")
        return pd.DataFrame()

    rows = []
    for filename in tqdm(image_files, desc="Processing images"):
        filepath = os.path.join(directory_path, filename)
        try:
            row = generate(filepath)
            row["File Name"] = str(filename)
        except Exception as e:
            # Batch boundary: one bad image (I/O, API or parsing failure)
            # must not abort the run, so record the error and move on.
            print(f"Error processing file {filename}: {e}")
            row = {"File Name": str(filename), "Error": str(e)}
        rows.append(row)

    return pd.DataFrame(rows)


def main(directory_path='FOLDER4', output_csv_path='extracted_image_data.csv'):
    """Process *directory_path* and save the results to *output_csv_path*.

    Args:
        directory_path: Folder that must exist and contain the images.
        output_csv_path: Destination CSV file.
    """
    if not os.path.isdir(directory_path):
        print(f"Error: Directory '{directory_path}' not found. Please create it or provide the correct path.")
        return

    extracted_data = process_files(directory_path)
    if extracted_data.empty:
        print("\nNo data was extracted.")
        return

    print("\n--- Extracted Data (First 5 rows) ---")
    print(extracted_data.head())

    try:
        extracted_data.to_csv(output_csv_path, index=False)
    except Exception as e:
        # Script boundary: report the failure instead of a traceback.
        print(f"\nError saving data to CSV: {e}")
    else:
        print(f"\nSuccessfully saved extracted data to {output_csv_path}")


if __name__ == "__main__":
    main()