detection

SaiD 2025-12-10 11:25:53 +05:30
parent c445068773
commit f647a509d5
10 changed files with 368 additions and 256 deletions


@@ -0,0 +1,81 @@
Unknown
person
bicycle
car
motorcycle
airplane
bus
train
truck
boat
traffic light
fire hydrant
stop sign
parking meter
bench
bird
cat
dog
horse
sheep
cow
elephant
bear
zebra
giraffe
backpack
umbrella
handbag
tie
suitcase
frisbee
skis
snowboard
sports ball
kite
baseball bat
baseball glove
skateboard
surfboard
tennis racket
bottle
wine glass
cup
fork
knife
spoon
bowl
banana
apple
sandwich
orange
broccoli
carrot
hot dog
pizza
donut
cake
chair
couch
potted plant
bed
dining table
toilet
tv
laptop
mouse
remote
keyboard
cell phone
microwave
oven
toaster
sink
refrigerator
book
clock
vase
scissors
teddy bear
hair drier
toothbrush
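The list above is the 80-class COCO label set with an "Unknown" placeholder as its first line; AIModelImpl (below) reads it via FileUtil.loadLabels(context, "labels.txt") and maps each detected class index to a name. A minimal lookup sketch, assuming the list ships as assets/labels.txt to match that loader:

import android.content.Context
import org.tensorflow.lite.support.common.FileUtil

// Minimal sketch: map a raw class index from the detector to a readable label.
// Assumes the list above ships as assets/labels.txt, matching FileUtil.loadLabels(context, "labels.txt") in AIModelImpl.
fun lookupLabel(context: Context, classIndex: Int): String {
    val labels: List<String> = FileUtil.loadLabels(context, "labels.txt")
    // Out-of-range indices fall back to "Unknown", mirroring labels.getOrElse in AIModelImpl.
    return labels.getOrElse(classIndex) { "Unknown" }
}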


@@ -1,116 +1,94 @@
package com.example.livingai.data.ml
import android.content.Context
import android.graphics.Bitmap
import android.graphics.Color
import android.graphics.Rect
import com.example.livingai.domain.ml.AIModel
import com.google.mlkit.vision.common.InputImage
import com.google.mlkit.vision.segmentation.subject.SubjectSegmentation
import com.google.mlkit.vision.segmentation.subject.SubjectSegmenterOptions
import kotlinx.coroutines.suspendCancellableCoroutine
import kotlin.coroutines.resume
import kotlin.coroutines.resumeWithException
import org.tensorflow.lite.Interpreter
import org.tensorflow.lite.support.common.FileUtil
import org.tensorflow.lite.support.image.ImageProcessor
import org.tensorflow.lite.support.image.TensorImage
import org.tensorflow.lite.support.image.ops.ResizeOp
import java.nio.ByteBuffer
import java.nio.ByteOrder
private const val MASK_COLOR = 0x5500FF00 // semi-transparent green overlay
class AIModelImpl(private val context: Context) : AIModel {
class AIModelImpl : AIModel {
private val objectDetector: Interpreter
private val labels: List<String>
private val segmenter by lazy {
val options = SubjectSegmenterOptions.Builder()
.enableForegroundBitmap()
init {
// Load the TFLite model from assets
val modelBuffer = FileUtil.loadMappedFile(context, "efficientdet-lite0.tflite")
val options = Interpreter.Options().apply { numThreads = 4 }
objectDetector = Interpreter(modelBuffer, options)
// Load labels from assets
labels = try {
FileUtil.loadLabels(context, "labels.txt")
} catch (e: Exception) {
e.printStackTrace()
emptyList()
}
}
override suspend fun detectObject(bitmap: Bitmap): ObjectDetectionResult? {
// Preprocess the image
val imageProcessor = ImageProcessor.Builder()
.add(ResizeOp(320, 320, ResizeOp.ResizeMethod.BILINEAR))
.build()
SubjectSegmentation.getClient(options)
var tensorImage = TensorImage.fromBitmap(bitmap)
tensorImage = imageProcessor.process(tensorImage)
// Prepare output buffers for the detector heads.
// A runtime shape mismatch ([1, 25, 4] vs the assumed [1, 10, 4]) showed this model export emits 25 detections, not 10.
val locations = Array(1) { Array(25) { FloatArray(4) } }
val classes = Array(1) { FloatArray(25) }
val scores = Array(1) { FloatArray(25) }
val numDetections = FloatArray(1)
val outputs = mapOf(
0 to locations,
1 to classes,
2 to scores,
3 to numDetections
)
// Run inference
objectDetector.runForMultipleInputsOutputs(arrayOf(tensorImage.buffer), outputs)
// Post-process the results
val bestDetection = scores[0].withIndex()
.maxByOrNull { it.value }
?.takeIf { it.value > 0.5f } // Confidence threshold
if (bestDetection != null) {
val index = bestDetection.index
val score = bestDetection.value
val location = locations[0][index] // [ymin, xmin, ymax, xmax]
val labelIndex = classes[0][index].toInt()
val label = labels.getOrElse(labelIndex) { "Unknown" }
// Convert normalized coordinates to absolute pixel values
val ymin = location[0] * bitmap.height
val xmin = location[1] * bitmap.width
val ymax = location[2] * bitmap.height
val xmax = location[3] * bitmap.width
val boundingBox = Rect(xmin.toInt(), ymin.toInt(), xmax.toInt(), ymax.toInt())
return ObjectDetectionResult(boundingBox, label, score)
}
return null
}
override fun deriveInference(bitmap: Bitmap): String = "Inference Result"
// This is no longer the primary function, but kept for interface compliance
override suspend fun segmentImage(bitmap: Bitmap): Triple<Bitmap, BooleanArray, Rect>? {
return suspendCancellableCoroutine { cont ->
val image = InputImage.fromBitmap(bitmap, 0)
segmenter.process(image)
.addOnSuccessListener { result ->
val fg = result.foregroundBitmap ?: return@addOnSuccessListener cont.resume(null)
// ML Kit's foreground bitmap is the subject cut out on a transparent background,
// so pixel alpha alone identifies which pixels belong to the object.
val booleanMask = createBooleanMask(fg)
// The UI overlays the colorized mask, IoU/overlap checks use the boolean mask,
// and DistanceEstimator consumes the mask bitmap, so the preserved alpha is what matters there.
val colorMask = createColorizedMask(fg)
val bbox = computeBoundingBox(booleanMask, fg.width, fg.height)
// Return the colorized mask for display, the boolean mask for geometry, and the subject bounding box.
cont.resume(Triple(colorMask, booleanMask, bbox))
}
.addOnFailureListener { e ->
cont.resumeWithException(e)
}
}
// Returning null as we are focusing on object detection now
return null
}
private fun createColorizedMask(maskBitmap: Bitmap): Bitmap {
val w = maskBitmap.width
val h = maskBitmap.height
val pixels = IntArray(w * h)
maskBitmap.getPixels(pixels, 0, w, 0, 0, w, h)
for (i in pixels.indices) {
// ML Kit Foreground Bitmap: Non-transparent pixels are the object.
if (Color.alpha(pixels[i]) > 0) {
pixels[i] = MASK_COLOR
}
}
return Bitmap.createBitmap(pixels, w, h, Bitmap.Config.ARGB_8888)
}
private fun createBooleanMask(bitmap: Bitmap): BooleanArray {
val w = bitmap.width
val h = bitmap.height
val mask = BooleanArray(w * h)
val pixels = IntArray(w * h)
bitmap.getPixels(pixels, 0, w, 0, 0, w, h)
for (i in pixels.indices) {
mask[i] = Color.alpha(pixels[i]) > 0
}
return mask
}
private fun computeBoundingBox(mask: BooleanArray, w: Int, h: Int): Rect {
var minX = Int.MAX_VALUE
var minY = Int.MAX_VALUE
var maxX = Int.MIN_VALUE
var maxY = Int.MIN_VALUE
for (y in 0 until h) {
for (x in 0 until w) {
val idx = y * w + x
if (mask[idx]) {
if (x < minX) minX = x
if (y < minY) minY = y
if (x > maxX) maxX = x
if (y > maxY) maxY = y
}
}
}
return if (minX == Int.MAX_VALUE) {
Rect(0, 0, 0, 0)
} else {
Rect(minX, minY, maxX, maxY)
}
}
override fun deriveInference(bitmap: Bitmap): String = "Object Detection"
}
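For orientation, a minimal caller sketch for the new detection path. The AIModelImpl constructor, the suspend detectObject signature, and the ObjectDetectionResult fields come from the code above; the coroutine scope, log tag, and constructing the model inline (instead of injecting the Koin singleton defined later in this commit) are illustrative assumptions.

import android.content.Context
import android.graphics.Bitmap
import android.util.Log
import com.example.livingai.data.ml.AIModelImpl
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.launch

// Sketch: run one detection on a frame and log the best result.
fun detectOnce(context: Context, frame: Bitmap, scope: CoroutineScope = CoroutineScope(Dispatchers.Default)) {
    val model = AIModelImpl(context)   // normally injected; the Koin module below provides it as a singleton
    scope.launch {
        // detectObject returns null when nothing clears the 0.5 confidence threshold.
        val result = model.detectObject(frame) ?: return@launch
        Log.d("Detection", "${result.label} ${(result.confidence * 100).toInt()}% at ${result.boundingBox}")
    }
}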


@@ -0,0 +1,9 @@
package com.example.livingai.data.ml
import android.graphics.Rect
data class ObjectDetectionResult(
val boundingBox: Rect,
val label: String,
val confidence: Float
)


@@ -4,6 +4,7 @@ import android.content.ContentValues
import android.content.Context
import android.graphics.Bitmap
import android.graphics.Matrix
import android.graphics.Rect
import android.provider.MediaStore
import androidx.camera.core.ImageProxy
import com.example.livingai.data.ml.DistanceEstimatorImpl
@@ -48,16 +49,69 @@ class CameraRepositoryImpl(
bitmap: Bitmap,
requestedOrientation: Orientation,
silhouetteBitmap: Bitmap,
realObjectHeightMeters: Float?, // ★ NEW PARAM
focalLengthPixels: Float // from camera intrinsics
realObjectHeightMeters: Float?,
focalLengthPixels: Float,
boundingBox: Rect?
): OrientationState = withContext(Dispatchers.Default) {
// 1. Collect segmentation
val meta = FrameMetadataProvider.collectMetadata(bitmap)
val bbox = meta.segmentationBox
val mask = meta.segmentationMaskBitmap
// Prefer the boundingBox passed in from object detection over re-running segmentation:
// FrameMetadataProvider.collectMetadata() calls aiModel.segmentImage(), which currently returns null,
// so bbox and mask end up null and processFrame would return early with "Segmentation missing".
if (bbox == null || mask == null) {
// Fallback: synthesize segmentation metadata from the detection bounding box.
// The mask becomes a simple box mask (a white rectangle over the bbox); DistanceEstimator only needs a Bitmap mask.
val syntheticMeta = if (boundingBox != null) {
// Build the box mask here (we are already on Dispatchers.Default) and gather IMU, rotation and depth manually,
// since FrameMetadataProvider.collectMetadata() is being bypassed.
val maskBitmap = Bitmap.createBitmap(bitmap.width, bitmap.height, Bitmap.Config.ARGB_8888)
val canvas = android.graphics.Canvas(maskBitmap)
val paint = android.graphics.Paint().apply { color = android.graphics.Color.WHITE }
canvas.drawRect(boundingBox, paint)
val imu = FrameMetadataProvider.getIMU()
val rot = FrameMetadataProvider.getRotation()
val depth = FrameMetadataProvider.getDepthData()
FrameMetadataProvider.FrameCollectedMetadata(
segmentationMaskBitmap = maskBitmap,
segmentationBox = boundingBox,
depthMeters = depth.depthMeters,
depthWidth = depth.width,
depthHeight = depth.height,
depthConfidence = depth.confidence,
pitch = imu.pitch,
roll = imu.roll,
yaw = imu.yaw,
rotationDegrees = rot
)
} else {
FrameMetadataProvider.collectMetadata(bitmap)
}
val bbox = syntheticMeta.segmentationBox
// val mask = syntheticMeta.segmentationMaskBitmap // Mask is used inside distanceEstimator
if (bbox == null) {
return@withContext OrientationState(
success = false,
reason = "Segmentation missing",
@@ -78,7 +132,7 @@ class CameraRepositoryImpl(
)
// 3. Build FrameData with relative depth only
val frameData = meta.toFrameData(bitmap).copy(
val frameData = syntheticMeta.toFrameData(bitmap).copy(
medianDepth = midasResult?.relativeDepth
)
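The realObjectHeightMeters and focalLengthPixels parameters point at a pinhole-camera distance estimate further down the pipeline. This diff does not show how DistanceEstimatorImpl combines them, so the sketch below is only the textbook approximation under that assumption, using the detection box height as the object's height in pixels:

import android.graphics.Rect

// Illustrative only: the standard pinhole approximation the parameter names suggest.
// This is an assumption, not necessarily what DistanceEstimatorImpl actually computes.
fun estimateDistanceMeters(realObjectHeightMeters: Float, focalLengthPixels: Float, boundingBox: Rect): Float? {
    val objectHeightPx = boundingBox.height()
    if (objectHeightPx <= 0 || realObjectHeightMeters <= 0f || focalLengthPixels <= 0f) return null
    // Similar triangles: realHeight / distance == objectHeightPx / focalLengthPixels
    return realObjectHeightMeters * focalLengthPixels / objectHeightPx
}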


@@ -111,7 +111,7 @@ val appModule = module {
}
// ML Model
single<AIModel> { AIModelImpl() }
single<AIModel> { AIModelImpl(androidContext()) }
single<ObjectDetector> {
ObjectDetectorImpl(
context = androidContext(),


@@ -2,8 +2,10 @@ package com.example.livingai.domain.ml
import android.graphics.Bitmap
import android.graphics.Rect
import com.example.livingai.data.ml.ObjectDetectionResult
interface AIModel {
fun deriveInference(bitmap: Bitmap): String
suspend fun segmentImage(bitmap: Bitmap): Triple<Bitmap, BooleanArray, Rect>?
suspend fun detectObject(bitmap: Bitmap): ObjectDetectionResult?
}


@@ -1,6 +1,7 @@
package com.example.livingai.domain.repository
import android.graphics.Bitmap
import android.graphics.Rect
import androidx.camera.core.ImageProxy
import com.example.livingai.domain.ml.Orientation
import com.example.livingai.domain.ml.OrientationState
@@ -12,7 +13,8 @@ interface CameraRepository {
requestedOrientation: Orientation,
silhouetteBitmap: Bitmap,
realObjectHeightMeters: Float?,
focalLengthPixels: Float
focalLengthPixels: Float,
boundingBox: Rect? = null
): OrientationState
suspend fun saveImage(bitmap: Bitmap, animalId: String, orientation: String?): String
}


@@ -7,10 +7,14 @@ import androidx.camera.core.ImageProxy
import androidx.camera.view.LifecycleCameraController
import androidx.compose.foundation.Image
import androidx.compose.foundation.background
import androidx.compose.foundation.border
import androidx.compose.foundation.layout.Box
import androidx.compose.foundation.layout.BoxWithConstraints
import androidx.compose.foundation.layout.Column
import androidx.compose.foundation.layout.fillMaxSize
import androidx.compose.foundation.layout.offset
import androidx.compose.foundation.layout.padding
import androidx.compose.foundation.layout.size
import androidx.compose.foundation.shape.RoundedCornerShape
import androidx.compose.material.icons.Icons
import androidx.compose.material.icons.filled.Camera
@@ -18,7 +22,6 @@ import androidx.compose.material3.CircularProgressIndicator
import androidx.compose.material3.FabPosition
import androidx.compose.material3.FloatingActionButton
import androidx.compose.material3.Icon
import androidx.compose.material3.MaterialTheme
import androidx.compose.material3.Scaffold
import androidx.compose.material3.Text
import androidx.compose.runtime.Composable
@@ -32,6 +35,7 @@ import androidx.compose.ui.graphics.Color
import androidx.compose.ui.graphics.asImageBitmap
import androidx.compose.ui.layout.ContentScale
import androidx.compose.ui.platform.LocalContext
import androidx.compose.ui.platform.LocalDensity
import androidx.compose.ui.unit.dp
import androidx.navigation.NavController
import androidx.core.content.ContextCompat
@@ -67,6 +71,7 @@ fun CameraScreen(
PermissionWrapper {
val state by viewModel.state.collectAsState()
val context = LocalContext.current
val density = LocalDensity.current
val controller = remember {
LifecycleCameraController(context).apply {
@@ -83,31 +88,14 @@
viewModel.onEvent(CameraEvent.ImageCaptured(image))
}
override fun onError(exception: ImageCaptureException) {
// Handle error, e.g., log it or show a message
}
override fun onError(exception: ImageCaptureException) {}
}
)
}
LaunchedEffect(state.shouldAutoCapture) {
if (state.shouldAutoCapture) {
takePhoto()
viewModel.onEvent(CameraEvent.AutoCaptureTriggered)
}
}
LaunchedEffect(state.capturedImageUri) {
state.capturedImageUri?.let {
navController.navigate(
Route.ViewImageScreen(
imageUri = it.toString(),
shouldAllowRetake = true,
showAccept = true,
orientation = orientation,
animalId = animalId
)
)
navController.navigate(Route.ViewImageScreen(it.toString(), true, orientation, true, false, animalId))
viewModel.onEvent(CameraEvent.ClearCapturedImage)
}
}
@@ -120,71 +108,95 @@ fun CameraScreen(
},
floatingActionButtonPosition = FabPosition.Center
) { paddingValues ->
Box(
modifier = Modifier.fillMaxSize(),
) {
Box(
modifier = Modifier.fillMaxSize()
) {
CameraPreview(
modifier = Modifier.fillMaxSize(),
controller = controller,
onFrame = { bitmap, rotation, fxPixels ->
viewModel.onEvent(CameraEvent.FrameReceived(bitmap, rotation, fxPixels))
}
BoxWithConstraints(modifier = Modifier.fillMaxSize().padding(paddingValues)) {
val screenWidth = maxWidth
val screenHeight = maxHeight
CameraPreview(
modifier = Modifier.fillMaxSize(),
controller = controller,
onFrame = { bitmap, rotation, fxPixels ->
viewModel.onEvent(CameraEvent.FrameReceived(bitmap, rotation, fxPixels))
}
)
state.detectionResult?.let { detection ->
val imageWidth = state.imageWidth.toFloat()
val imageHeight = state.imageHeight.toFloat()
if (imageWidth == 0f || imageHeight == 0f) return@let
val screenW = with(density) { screenWidth.toPx() }
val screenH = with(density) { screenHeight.toPx() }
val scaleX = screenW / imageWidth
val scaleY = screenH / imageHeight
val scale = maxOf(scaleX, scaleY) // For FILL_CENTER behavior
val offsetX = (screenW - imageWidth * scale) / 2f
val offsetY = (screenH - imageHeight * scale) / 2f
val bbox = detection.boundingBox
val left = bbox.left * scale + offsetX
val top = bbox.top * scale + offsetY
val leftDp = with(density) { left.toDp() }
val topDp = with(density) { top.toDp() }
val widthDp = with(density) { (bbox.width() * scale).toDp() }
val heightDp = with(density) { (bbox.height() * scale).toDp() }
Box(
modifier = Modifier
.offset(x = leftDp, y = topDp)
.size(width = widthDp, height = heightDp)
.border(2.dp, Color.Yellow)
)
// The ML segmentation mask
state.segmentationMask?.let { mask ->
Image(
bitmap = mask.asImageBitmap(),
contentDescription = "Segmentation Overlay",
modifier = Modifier.fillMaxSize(),
contentScale = ContentScale.FillBounds,
alpha = 0.5f
)
}
state.silhouetteMask?.let {
Image(
bitmap = it.asImageBitmap(),
contentDescription = "Silhouette Overlay",
modifier = Modifier.fillMaxSize(),
contentScale = ContentScale.Fit,
alpha = 0.4f
// Overlay 1: Object Label & Confidence (Above the box)
Column(
modifier = Modifier
.offset(x = leftDp, y = topDp - 25.dp)
.background(Color.Black.copy(alpha = 0.7f))
.padding(4.dp)
) {
Text(
text = "${detection.label} (${(detection.confidence * 100).toInt()}%)",
color = Color.White
)
}
}
// Debug Overlay
// Overlay 2: Fixed top-right corner info
state.orientationState?.let { orient ->
Box(
Column(
modifier = Modifier
.align(Alignment.TopEnd)
.padding(16.dp)
.background(Color.Black.copy(alpha = 0.5f), RoundedCornerShape(8.dp))
.background(Color.Black.copy(alpha = 0.7f), shape = RoundedCornerShape(8.dp))
.padding(8.dp)
) {
Column {
Text("Success: ${orient.success}", color = Color.White)
Text("Reason: ${orient.reason}", color = Color.White)
orient.pixelMetrics?.let { pm ->
Text("Width (px): ${pm.widthPx}", color = Color.White)
Text("Height (px): ${pm.heightPx}", color = Color.White)
}
// Display depth metrics from OrientationState
orient.relativeDepth?.let { rel ->
Text("Rel Depth: %.4f".format(rel), color = Color.White)
}
orient.absoluteDistanceMeters?.let { abs ->
Text("Dist (m): %.2f".format(abs), color = Color.White)
}
Text("IOU: ${orient.iouScore}", color = Color.White)
Text("Matched: ${orient.orientationMatched}", color = Color.White)
if (orient.relativeDepth != null) {
Text(
text = "Rel Depth: %.2f".format(orient.relativeDepth),
color = Color.Cyan
)
}
if (orient.absoluteDistanceMeters != null) {
Text(
text = "Dist: %.2fm".format(orient.absoluteDistanceMeters),
color = Color.Green
)
}
if (orient.iouScore != null) {
Text(
text = "IoU: %.2f".format(orient.iouScore),
color = Color.Yellow
)
}
orient.pixelMetrics?.let { metrics ->
Text(
text = "W: ${metrics.widthPx}px H: ${metrics.heightPx}px",
color = Color.White
)
}
}
}
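The overlay code above converts the detection box from bitmap coordinates to preview coordinates with scale = max(scaleX, scaleY) plus centering offsets, which matches a fill-center preview that covers the screen and crops the excess. A self-contained sketch of the same mapping, with hypothetical sizes used only to show the arithmetic:

import android.graphics.RectF

// Same image-to-screen mapping the bounding-box overlay uses above (fill-center scaling).
fun mapToScreen(box: RectF, imageW: Float, imageH: Float, screenW: Float, screenH: Float): RectF {
    val scale = maxOf(screenW / imageW, screenH / imageH)   // cover the screen, crop the excess
    val offsetX = (screenW - imageW * scale) / 2f           // negative on the cropped axis
    val offsetY = (screenH - imageH * scale) / 2f
    return RectF(
        box.left * scale + offsetX,
        box.top * scale + offsetY,
        box.right * scale + offsetX,
        box.bottom * scale + offsetY
    )
}
// Example (hypothetical 480x640 frame on a 1080x2220 preview):
// scale = max(1080/480, 2220/640) = 3.469, offsetX = (1080 - 480 * 3.469) / 2 = -292.5, offsetY = 0,
// i.e. the scaled frame overflows the screen by about 292.5 px on each horizontal side.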


@@ -6,6 +6,7 @@ import android.net.Uri
import androidx.camera.core.ImageProxy
import androidx.lifecycle.ViewModel
import androidx.lifecycle.viewModelScope
import com.example.livingai.data.ml.ObjectDetectionResult
import com.example.livingai.domain.ml.AIModel
import com.example.livingai.domain.ml.Orientation
import com.example.livingai.domain.ml.OrientationState
@@ -72,7 +73,8 @@ class CameraViewModel(
private fun clearCaptured() {
_state.value = _state.value.copy(
capturedImageUri = null,
segmentationMask = null
segmentationMask = null,
detectionResult = null // Clear detection result as well
)
}
@@ -89,80 +91,49 @@
}
private fun handleFrame(bitmap: Bitmap, rotationDegrees: Int, focalLengthPixels: Float) {
if (_state.value.isCapturing || _state.value.shouldAutoCapture) {
return
}
if (isProcessingFrame.compareAndSet(false, true)) {
viewModelScope.launch {
try {
val currentOrientationStr = _state.value.orientation
val silhouette = _state.value.savedMaskBitmap
val orientationState = if (currentOrientationStr != null && silhouette != null) {
val orientationEnum = mapStringToOrientation(currentOrientationStr)
cameraRepository.processFrame(
bitmap,
orientationEnum,
silhouette,
1.55f,
focalLengthPixels
)
// Rotate bitmap to be upright before processing
val rotatedBitmap = if (rotationDegrees != 0) {
val matrix = Matrix().apply { postRotate(rotationDegrees.toFloat()) }
Bitmap.createBitmap(bitmap, 0, 0, bitmap.width, bitmap.height, matrix, true)
} else {
null
bitmap
}
val result = aiModel.segmentImage(bitmap)
if (result != null) {
val (maskBitmap, _) = result
// Perform Object Detection
val detectionResult = aiModel.detectObject(rotatedBitmap)
val rotatedMask = if (rotationDegrees != 0) {
val matrix = Matrix().apply { postRotate(rotationDegrees.toFloat()) }
Bitmap.createBitmap(
maskBitmap,
0,
0,
maskBitmap.width,
maskBitmap.height,
matrix,
true
)
} else {
maskBitmap
}
var orientationState: OrientationState? = null
val requestedOrientationStr = _state.value.orientation
val output = if(_state.value.orientation == "front" || _state.value.orientation == "back")
fitImageToCrop(rotatedMask, screenDims.screenWidth, screenDims.screenHeight)
else
fitImageToCrop(rotatedMask, screenDims.screenHeight, screenDims.screenWidth)
if (requestedOrientationStr != null && detectionResult != null) {
// processFrame needs a silhouette bitmap; reuse the one loaded in setContext and skip the call when it is missing.
val silhouette = _state.value.silhouetteMask
_state.value = _state.value.copy(
segmentationMask = output,
orientationState = orientationState
)
if (_state.value.isAutoCaptureEnabled &&
_state.value.savedMaskBitmap != null &&
output != null
) {
val isValidCapture = calculateDistance(
_state.value.distanceMethod,
_state.value.savedMaskBitmap!!,
output,
_state.value.matchThreshold
)
if (isValidCapture) {
_state.value = _state.value.copy(shouldAutoCapture = true)
}
}
} else {
_state.value = _state.value.copy(
segmentationMask = null,
orientationState = orientationState
)
if (silhouette != null) {
orientationState = cameraRepository.processFrame(
bitmap = rotatedBitmap,
requestedOrientation = mapStringToOrientation(requestedOrientationStr),
silhouetteBitmap = silhouette,
realObjectHeightMeters = null, // Or some default
focalLengthPixels = focalLengthPixels,
boundingBox = detectionResult.boundingBox // Pass the bbox we just found
)
}
}
_state.value = _state.value.copy(
detectionResult = detectionResult,
orientationState = orientationState, // Update state
imageWidth = rotatedBitmap.width,
imageHeight = rotatedBitmap.height
)
} catch (e: Exception) {
e.printStackTrace()
} finally {
isProcessingFrame.set(false)
}
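handleFrame throttles analysis with isProcessingFrame.compareAndSet(false, true) and releases the flag in finally, so any frame that arrives while one is still being analysed is dropped rather than queued. A stripped-down sketch of that pattern follows; the class and parameter names are illustrative, and only the AtomicBoolean gating mirrors the ViewModel:

import java.util.concurrent.atomic.AtomicBoolean
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.launch

// Drop-while-busy throttling: at most one frame analysis runs at a time.
class FrameGate(private val scope: CoroutineScope) {
    private val busy = AtomicBoolean(false)

    fun offer(analyse: suspend () -> Unit) {
        // A frame is already in flight: silently drop this one.
        if (!busy.compareAndSet(false, true)) return
        scope.launch {
            try {
                analyse()            // placeholder for rotation + detection + processFrame
            } finally {
                busy.set(false)      // always release, even if analysis throws
            }
        }
    }
}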
@@ -195,7 +166,10 @@ data class CameraUiState(
val matchThreshold: Int = 50,
val distanceMethod: String = "Jaccard",
val shouldAutoCapture: Boolean = false,
val orientationState: OrientationState? = null
val orientationState: OrientationState? = null,
val detectionResult: ObjectDetectionResult? = null,
val imageWidth: Int = 0,
val imageHeight: Int = 0
)
sealed class CameraEvent {