initial commit

2025-06-05 18:08:36 +05:30
commit 9cc5f634ee
1 changed files with 142 additions and 0 deletions
--- a/ac_specs_extraction.py
+++ b/ac_specs_extraction.py
@@ -0,0 +1,142 @@
+import base64
+import vertexai
+from vertexai.generative_models import GenerativeModel, Part, SafetySetting
+import io
+from PIL import Image
+import json
+import os
+import pandas as pd
+from tqdm import tqdm
+
+
+def image_to_base64(image_path):
+    """Return the content of the file at image_path as a base64 text string."""
+    with open(image_path, "rb") as source:
+        raw = source.read()
+    return base64.b64encode(raw).decode("utf-8")
+
+def generate(image_path):
+    """Extract the fields listed in the prompt from an image via Gemini on Vertex AI.
+
+    Args:
+        image_path: Path of the image file to send to the model.
+
+    Returns:
+        The model's reply text parsed with json.loads.
+
+    Raises:
+        OSError: If the image file cannot be read.
+        json.JSONDecodeError: If the reply text is not valid JSON.
+    """
+    import mimetypes  # stdlib; imported here because only this function uses it
+
+    vertexai.init(project="lt-dev-prj-3d3b5f2a", location="asia-south1")
+    model = GenerativeModel("gemini-1.5-pro-002")
+    generation_config = {
+        "max_output_tokens": 8192,
+        "temperature": 0.3,
+        "top_p": 0.95,
+        "response_mime_type": "application/json",
+    }
+    # Every listed harm category is configured with the BLOCK_NONE threshold.
+    harm = SafetySetting.HarmCategory
+    safety_settings = [
+        SafetySetting(category=category, threshold=SafetySetting.HarmBlockThreshold.BLOCK_NONE)
+        for category in (
+            harm.HARM_CATEGORY_HATE_SPEECH,
+            harm.HARM_CATEGORY_DANGEROUS_CONTENT,
+            harm.HARM_CATEGORY_SEXUALLY_EXPLICIT,
+            harm.HARM_CATEGORY_HARASSMENT,
+        )
+    ]
+
+    # Send the raw bytes with the MIME type matching the extension (JPEG is the fallback).
+    with open(image_path, "rb") as image_file:
+        image_bytes = image_file.read()
+    guessed_type = mimetypes.guess_type(image_path)[0]
+    mime_type = guessed_type if (guessed_type or "").startswith("image/") else "image/jpeg"
+    image = Part.from_data(mime_type=mime_type, data=image_bytes)
+
+    prompt = """Extract the following information from the image in JSON format.
+            STAR RATING --> INTEGER
+            ISEER --> FLOAT
+            EER (W/W) --> FLOAT
+            Label Period Start Date --> DATE
+            Label Period End Date --> DATE
+            ApplianceType --> STRING
+            Brand --> STRING
+            Model --> STRING
+            Year --> DATE (YYYY)
+            Cooling Capacity (100%) (W) --> INTEGER
+            Cooling Capacity (50%)(W) --> INTEGER
+            Cooling Capacity (W) --> INTEGER
+            Electricity Consumption (UNITS) --> FLOAT
+            Power Consumption (W) --> FLOAT
+            Compressor Type --> STRING
+            Variable Speed Compressor --> BOOLEAN(YES/NO)
+            Heat Pump --> BOOLEAN(YES/NO)
+            MODEL NO. --> STRING
+            MODEL NAME. --> STRING
+            INPUT POWER --> INTEGER
+            REFRIGERANT --> STRING
+            ISO --> STRING
+            OPERATING HOURS PER ANNUM --> INTEGER
+
+            Note: STAR RATING is usually highlighted in Red. SO all the starts with background in RED are considered as STAR RATING. EG.
+            If only one star has background RED then the STAR RATING WILL BE 1. If all 5 starts have Background RED then the STAR RATING
+            will be 5.
+
+            Provide blank value "" for fields that are not applicable/found in the image.
+            The value for boolean fields should be "YES" or "NO".
+
+            There should strictly no other keys apart from the mentioned above. Even the naming conventions should not change.
+            """
+
+    responses = model.generate_content(
+        [image, prompt], generation_config=generation_config, safety_settings=safety_settings
+    )
+    return json.loads(responses.text)
+
+def process_files(directory_path):
+    """Run generate() on each image file in directory_path; return a DataFrame."""
+    print(f"Processing files in directory: {directory_path}")
+    image_exts = ('.jpg', '.jpeg', '.png', '.gif')
+    candidates = [
+        name for name in os.listdir(directory_path)
+        if os.path.isfile(os.path.join(directory_path, name)) and name.lower().endswith(image_exts)
+    ]
+    if not candidates:
+        print("No image files found in the directory.")
+        return pd.DataFrame([])
+
+    rows = []
+    for filename in tqdm(candidates, desc="Processing images"):
+        try:
+            record = generate(os.path.join(directory_path, filename))
+            record["File Name"] = str(filename)
+        except Exception as e:  # best-effort batch: keep going after a failure
+            print(f"Error processing file {filename}: {e}")
+            record = {"File Name": str(filename), "Error": str(e)}
+        rows.append(record)
+    return pd.DataFrame(rows)
+
+if __name__ == "__main__":
+    # Script entry point: run the extraction over every image in the input
+    # folder, print a preview of the result and export it as CSV.
+    directory_path = 'FOLDER4'  # Make sure this folder exists and contains images
+    output_csv_path = 'extracted_image_data.csv'
+
+    if os.path.isdir(directory_path):
+        extracted_data = process_files(directory_path)
+        if extracted_data.empty:
+            print("\nNo data was extracted.")
+        else:
+            print("\n--- Extracted Data (First 5 rows) ---")
+            print(extracted_data.head())
+            try:
+                extracted_data.to_csv(output_csv_path, index=False)
+                print(f"\nSuccessfully saved extracted data to {output_csv_path}")
+            except Exception as e:
+                print(f"\nError saving data to CSV: {e}")
+    else:
+        print(f"Error: Directory '{directory_path}' not found. Please create it or provide the correct path.")