commit 53131e903f43fb0522aac89ea649898e9ce61919 Author: Rasul Date: Tue May 6 15:25:37 2025 +0300 init diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d83c492 --- /dev/null +++ b/.gitignore @@ -0,0 +1,172 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +!hy3dgen/texgen/custom_rasterizer/lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +#uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ +.DS_Store +# Cython debug symbols +cython_debug/ +gradio_cache/ +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea +#Docs +*.md +*.pdf diff --git a/api_server.py b/api_server.py new file mode 100644 index 0000000..dd2895b --- /dev/null +++ b/api_server.py @@ -0,0 +1,316 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +""" +A model worker executes the model. +""" +import argparse +import asyncio +import base64 +import logging +import logging.handlers +import os +import sys +import tempfile +import threading +import traceback +import uuid +from io import BytesIO + +import torch +import trimesh +import uvicorn +from PIL import Image +from fastapi import FastAPI, Request +from fastapi.responses import JSONResponse, FileResponse + +from hy3dgen.rembg import BackgroundRemover +from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline, FloaterRemover, DegenerateFaceRemover, FaceReducer, \ + MeshSimplifier +from hy3dgen.texgen import Hunyuan3DPaintPipeline +from hy3dgen.text2image import HunyuanDiTPipeline + +LOGDIR = '.' + +server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" +moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." 
+ +handler = None + + +def build_logger(logger_name, logger_filename): + global handler + + formatter = logging.Formatter( + fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + + # Set the format of root handlers + if not logging.getLogger().handlers: + logging.basicConfig(level=logging.INFO) + logging.getLogger().handlers[0].setFormatter(formatter) + + # Redirect stdout and stderr to loggers + stdout_logger = logging.getLogger("stdout") + stdout_logger.setLevel(logging.INFO) + sl = StreamToLogger(stdout_logger, logging.INFO) + sys.stdout = sl + + stderr_logger = logging.getLogger("stderr") + stderr_logger.setLevel(logging.ERROR) + sl = StreamToLogger(stderr_logger, logging.ERROR) + sys.stderr = sl + + # Get logger + logger = logging.getLogger(logger_name) + logger.setLevel(logging.INFO) + + # Add a file handler for all loggers + if handler is None: + os.makedirs(LOGDIR, exist_ok=True) + filename = os.path.join(LOGDIR, logger_filename) + handler = logging.handlers.TimedRotatingFileHandler( + filename, when='D', utc=True, encoding='UTF-8') + handler.setFormatter(formatter) + + for name, item in logging.root.manager.loggerDict.items(): + if isinstance(item, logging.Logger): + item.addHandler(handler) + + return logger + + +class StreamToLogger(object): + """ + Fake file-like stream object that redirects writes to a logger instance. + """ + + def __init__(self, logger, log_level=logging.INFO): + self.terminal = sys.stdout + self.logger = logger + self.log_level = log_level + self.linebuf = '' + + def __getattr__(self, attr): + return getattr(self.terminal, attr) + + def write(self, buf): + temp_linebuf = self.linebuf + buf + self.linebuf = '' + for line in temp_linebuf.splitlines(True): + # From the io.TextIOWrapper docs: + # On output, if newline is None, any '\n' characters written + # are translated to the system default line separator. + # By default sys.stdout.write() expects '\n' newlines and then + # translates them so this is still cross platform. 
+ if line[-1] == '\n': + self.logger.log(self.log_level, line.rstrip()) + else: + self.linebuf += line + + def flush(self): + if self.linebuf != '': + self.logger.log(self.log_level, self.linebuf.rstrip()) + self.linebuf = '' + + +def pretty_print_semaphore(semaphore): + if semaphore is None: + return "None" + return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" + + +SAVE_DIR = 'gradio_cache' +os.makedirs(SAVE_DIR, exist_ok=True) + +worker_id = str(uuid.uuid4())[:6] +logger = build_logger("controller", f"{SAVE_DIR}/controller.log") + + +def load_image_from_base64(image): + return Image.open(BytesIO(base64.b64decode(image))) + + +class ModelWorker: + def __init__(self, + model_path='tencent/Hunyuan3D-2mini', + tex_model_path='tencent/Hunyuan3D-2', + subfolder='hunyuan3d-dit-v2-mini-turbo', + device='cuda', + enable_tex=False): + self.model_path = model_path + self.worker_id = worker_id + self.device = device + logger.info(f"Loading the model {model_path} on worker {worker_id} ...") + + self.rembg = BackgroundRemover() + self.pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained( + model_path, + subfolder=subfolder, + use_safetensors=True, + device=device, + ) + self.pipeline.enable_flashvdm(mc_algo='mc') + # self.pipeline_t2i = HunyuanDiTPipeline( + # 'Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled', + # device=device + # ) + if enable_tex: + self.pipeline_tex = Hunyuan3DPaintPipeline.from_pretrained(tex_model_path) + + def get_queue_length(self): + if model_semaphore is None: + return 0 + else: + return args.limit_model_concurrency - model_semaphore._value + (len( + model_semaphore._waiters) if model_semaphore._waiters is not None else 0) + + def get_status(self): + return { + "speed": 1, + "queue_length": self.get_queue_length(), + } + + @torch.inference_mode() + def generate(self, uid, params): + if 'image' in params: + image = params["image"] + image = load_image_from_base64(image) + else: + if 'text' in params: + text = params["text"] + image = self.pipeline_t2i(text) + else: + raise ValueError("No input image or text provided") + + image = self.rembg(image) + params['image'] = image + + if 'mesh' in params: + mesh = trimesh.load(BytesIO(base64.b64decode(params["mesh"])), file_type='glb') + else: + seed = params.get("seed", 1234) + params['generator'] = torch.Generator(self.device).manual_seed(seed) + params['octree_resolution'] = params.get("octree_resolution", 128) + params['num_inference_steps'] = params.get("num_inference_steps", 5) + params['guidance_scale'] = params.get('guidance_scale', 5.0) + params['mc_algo'] = 'mc' + import time + start_time = time.time() + mesh = self.pipeline(**params)[0] + logger.info("--- %s seconds ---" % (time.time() - start_time)) + + if params.get('texture', False): + mesh = FloaterRemover()(mesh) + mesh = DegenerateFaceRemover()(mesh) + mesh = FaceReducer()(mesh, max_facenum=params.get('face_count', 40000)) + mesh = self.pipeline_tex(mesh, image) + + type = params.get('type', 'glb') + with tempfile.NamedTemporaryFile(suffix=f'.{type}', delete=False) as temp_file: + mesh.export(temp_file.name) + mesh = trimesh.load(temp_file.name) + save_path = os.path.join(SAVE_DIR, f'{str(uid)}.{type}') + mesh.export(save_path) + + torch.cuda.empty_cache() + return save_path, uid + + +app = FastAPI() +from fastapi.middleware.cors import CORSMiddleware + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], # 你可以指定允许的来源 + allow_credentials=True, + allow_methods=["*"], # 允许所有方法 + allow_headers=["*"], # 允许所有头部 +) + + 
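For reference, the route handlers that follow (/generate, /send and /status/{uid}) consume the same JSON fields that ModelWorker.generate reads above (image, text, mesh, seed, octree_resolution, num_inference_steps, guidance_scale, texture, face_count, type). Below is a minimal client sketch, not part of this commit, assuming the server runs with the defaults from the __main__ block (port 8081) and using the assets/demo.png image added elsewhere in this commit as input.

```python
# Minimal client sketch (illustrative only, not part of this commit). Assumes the
# API server below is running with its default arguments (http://localhost:8081)
# and that assets/demo.png from this commit is available locally.
import base64
import time

import requests

BASE_URL = "http://localhost:8081"

with open("assets/demo.png", "rb") as f:
    payload = {
        "image": base64.b64encode(f.read()).decode(),  # decoded server-side by load_image_from_base64()
        "seed": 1234,
        "octree_resolution": 256,
        "num_inference_steps": 5,
        "guidance_scale": 5.0,
        "texture": False,  # True only works when the server is started with --enable_tex
    }

# Asynchronous flow: /send starts the job in a background thread and returns a uid;
# /status/{uid} is polled until the worker has written <uid>.glb into gradio_cache.
uid = requests.post(f"{BASE_URL}/send", json=payload).json()["uid"]
while True:
    status = requests.get(f"{BASE_URL}/status/{uid}").json()
    if status["status"] == "completed":
        with open("output.glb", "wb") as out:
            out.write(base64.b64decode(status["model_base64"]))
        break
    time.sleep(5)

# Synchronous alternative: POST the same payload to /generate and write the raw
# response body, which is returned as a FileResponse containing the exported mesh.
# resp = requests.post(f"{BASE_URL}/generate", json=payload)
# with open("output_sync.glb", "wb") as out:
#     out.write(resp.content)
```

Note that /status only ever reports "processing" or "completed", so a job that fails in the worker thread never surfaces an error to this client; a real caller would want to bound the polling loop.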
+@app.post("/generate") +async def generate(request: Request): + logger.info("Worker generating...") + params = await request.json() + uid = uuid.uuid4() + try: + file_path, uid = worker.generate(uid, params) + return FileResponse(file_path) + except ValueError as e: + traceback.print_exc() + print("Caught ValueError:", e) + ret = { + "text": server_error_msg, + "error_code": 1, + } + return JSONResponse(ret, status_code=404) + except torch.cuda.CudaError as e: + print("Caught torch.cuda.CudaError:", e) + ret = { + "text": server_error_msg, + "error_code": 1, + } + return JSONResponse(ret, status_code=404) + except Exception as e: + print("Caught Unknown Error", e) + traceback.print_exc() + ret = { + "text": server_error_msg, + "error_code": 1, + } + return JSONResponse(ret, status_code=404) + + +@app.post("/send") +async def generate(request: Request): + logger.info("Worker send...") + params = await request.json() + uid = uuid.uuid4() + threading.Thread(target=worker.generate, args=(uid, params,)).start() + ret = {"uid": str(uid)} + return JSONResponse(ret, status_code=200) + + +@app.get("/status/{uid}") +async def status(uid: str): + save_file_path = os.path.join(SAVE_DIR, f'{uid}.glb') + print(save_file_path, os.path.exists(save_file_path)) + if not os.path.exists(save_file_path): + response = {'status': 'processing'} + return JSONResponse(response, status_code=200) + else: + base64_str = base64.b64encode(open(save_file_path, 'rb').read()).decode() + response = {'status': 'completed', 'model_base64': base64_str} + return JSONResponse(response, status_code=200) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="0.0.0.0") + parser.add_argument("--port", type=int, default=8081) + parser.add_argument("--model_path", type=str, default='tencent/Hunyuan3D-2mini') + parser.add_argument("--tex_model_path", type=str, default='tencent/Hunyuan3D-2') + parser.add_argument("--device", type=str, default="cuda") + parser.add_argument("--limit-model-concurrency", type=int, default=5) + parser.add_argument('--enable_tex', action='store_true') + args = parser.parse_args() + logger.info(f"args: {args}") + + model_semaphore = asyncio.Semaphore(args.limit_model_concurrency) + + worker = ModelWorker(model_path=args.model_path, device=args.device, enable_tex=args.enable_tex, + tex_model_path=args.tex_model_path) + uvicorn.run(app, host=args.host, port=args.port, log_level="info") diff --git a/assets/1.glb b/assets/1.glb new file mode 100644 index 0000000..0d33b00 Binary files /dev/null and b/assets/1.glb differ diff --git a/assets/demo.png b/assets/demo.png new file mode 100644 index 0000000..00fda1d Binary files /dev/null and b/assets/demo.png differ diff --git a/assets/env_maps/gradient.jpg b/assets/env_maps/gradient.jpg new file mode 100644 index 0000000..55546c1 Binary files /dev/null and b/assets/env_maps/gradient.jpg differ diff --git a/assets/env_maps/white.jpg b/assets/env_maps/white.jpg new file mode 100644 index 0000000..f7af123 Binary files /dev/null and b/assets/env_maps/white.jpg differ diff --git a/assets/example_images/004.png b/assets/example_images/004.png new file mode 100644 index 0000000..95eb0da Binary files /dev/null and b/assets/example_images/004.png differ diff --git a/assets/example_images/052.png b/assets/example_images/052.png new file mode 100644 index 0000000..685ef05 Binary files /dev/null and b/assets/example_images/052.png differ diff --git a/assets/example_images/073.png b/assets/example_images/073.png new 
file mode 100644 index 0000000..0390125 Binary files /dev/null and b/assets/example_images/073.png differ diff --git a/assets/example_images/075.png b/assets/example_images/075.png new file mode 100644 index 0000000..1381f75 Binary files /dev/null and b/assets/example_images/075.png differ diff --git a/assets/example_images/1008.png b/assets/example_images/1008.png new file mode 100644 index 0000000..473c933 Binary files /dev/null and b/assets/example_images/1008.png differ diff --git a/assets/example_images/101.png b/assets/example_images/101.png new file mode 100644 index 0000000..b0c5875 Binary files /dev/null and b/assets/example_images/101.png differ diff --git a/assets/example_images/1022.png b/assets/example_images/1022.png new file mode 100644 index 0000000..033fc3d Binary files /dev/null and b/assets/example_images/1022.png differ diff --git a/assets/example_images/1029.png b/assets/example_images/1029.png new file mode 100644 index 0000000..d3b13cd Binary files /dev/null and b/assets/example_images/1029.png differ diff --git a/assets/example_images/1037.png b/assets/example_images/1037.png new file mode 100644 index 0000000..e2ac72e Binary files /dev/null and b/assets/example_images/1037.png differ diff --git a/assets/example_images/1079.png b/assets/example_images/1079.png new file mode 100644 index 0000000..0398f6b Binary files /dev/null and b/assets/example_images/1079.png differ diff --git a/assets/example_images/1111.png b/assets/example_images/1111.png new file mode 100644 index 0000000..ea24af1 Binary files /dev/null and b/assets/example_images/1111.png differ diff --git a/assets/example_images/1123.png b/assets/example_images/1123.png new file mode 100644 index 0000000..71e862b Binary files /dev/null and b/assets/example_images/1123.png differ diff --git a/assets/example_images/1128.png b/assets/example_images/1128.png new file mode 100644 index 0000000..f04d30d Binary files /dev/null and b/assets/example_images/1128.png differ diff --git a/assets/example_images/1135.png b/assets/example_images/1135.png new file mode 100644 index 0000000..e4242dc Binary files /dev/null and b/assets/example_images/1135.png differ diff --git a/assets/example_images/1146.png b/assets/example_images/1146.png new file mode 100644 index 0000000..d9541b6 Binary files /dev/null and b/assets/example_images/1146.png differ diff --git a/assets/example_images/1148.png b/assets/example_images/1148.png new file mode 100644 index 0000000..2d76c9e Binary files /dev/null and b/assets/example_images/1148.png differ diff --git a/assets/example_images/1154.png b/assets/example_images/1154.png new file mode 100644 index 0000000..2ab169c Binary files /dev/null and b/assets/example_images/1154.png differ diff --git a/assets/example_images/1180.png b/assets/example_images/1180.png new file mode 100644 index 0000000..1bf552b Binary files /dev/null and b/assets/example_images/1180.png differ diff --git a/assets/example_images/1196.png b/assets/example_images/1196.png new file mode 100644 index 0000000..c7c4e9d Binary files /dev/null and b/assets/example_images/1196.png differ diff --git a/assets/example_images/1204.png b/assets/example_images/1204.png new file mode 100644 index 0000000..569dd9e Binary files /dev/null and b/assets/example_images/1204.png differ diff --git a/assets/example_images/1234.png b/assets/example_images/1234.png new file mode 100644 index 0000000..105a9a3 Binary files /dev/null and b/assets/example_images/1234.png differ diff --git a/assets/example_images/1310.png 
b/assets/example_images/1310.png new file mode 100644 index 0000000..42f5fdd Binary files /dev/null and b/assets/example_images/1310.png differ diff --git a/assets/example_images/1316.png b/assets/example_images/1316.png new file mode 100644 index 0000000..a3f2902 Binary files /dev/null and b/assets/example_images/1316.png differ diff --git a/assets/example_images/1354.png b/assets/example_images/1354.png new file mode 100644 index 0000000..685c11c Binary files /dev/null and b/assets/example_images/1354.png differ diff --git a/assets/example_images/1429.png b/assets/example_images/1429.png new file mode 100644 index 0000000..976d1a4 Binary files /dev/null and b/assets/example_images/1429.png differ diff --git a/assets/example_images/1493.png b/assets/example_images/1493.png new file mode 100644 index 0000000..dd1e979 Binary files /dev/null and b/assets/example_images/1493.png differ diff --git a/assets/example_images/1582.png b/assets/example_images/1582.png new file mode 100644 index 0000000..e67ed66 Binary files /dev/null and b/assets/example_images/1582.png differ diff --git a/assets/example_images/1583.png b/assets/example_images/1583.png new file mode 100644 index 0000000..01f6a48 Binary files /dev/null and b/assets/example_images/1583.png differ diff --git a/assets/example_images/1596.png b/assets/example_images/1596.png new file mode 100644 index 0000000..55d3970 Binary files /dev/null and b/assets/example_images/1596.png differ diff --git a/assets/example_images/1601.png b/assets/example_images/1601.png new file mode 100644 index 0000000..e3bdbbd Binary files /dev/null and b/assets/example_images/1601.png differ diff --git a/assets/example_images/1603.png b/assets/example_images/1603.png new file mode 100644 index 0000000..8f2eb53 Binary files /dev/null and b/assets/example_images/1603.png differ diff --git a/assets/example_images/1626.png b/assets/example_images/1626.png new file mode 100644 index 0000000..faa2f73 Binary files /dev/null and b/assets/example_images/1626.png differ diff --git a/assets/example_images/1627.png b/assets/example_images/1627.png new file mode 100644 index 0000000..000c9ab Binary files /dev/null and b/assets/example_images/1627.png differ diff --git a/assets/example_images/1654.png b/assets/example_images/1654.png new file mode 100644 index 0000000..2385031 Binary files /dev/null and b/assets/example_images/1654.png differ diff --git a/assets/example_images/167.png b/assets/example_images/167.png new file mode 100644 index 0000000..ab59d39 Binary files /dev/null and b/assets/example_images/167.png differ diff --git a/assets/example_images/1670.png b/assets/example_images/1670.png new file mode 100644 index 0000000..c6d7157 Binary files /dev/null and b/assets/example_images/1670.png differ diff --git a/assets/example_images/1679.png b/assets/example_images/1679.png new file mode 100644 index 0000000..ce14585 Binary files /dev/null and b/assets/example_images/1679.png differ diff --git a/assets/example_images/1687.png b/assets/example_images/1687.png new file mode 100644 index 0000000..90d406c Binary files /dev/null and b/assets/example_images/1687.png differ diff --git a/assets/example_images/1698.png b/assets/example_images/1698.png new file mode 100644 index 0000000..91e7032 Binary files /dev/null and b/assets/example_images/1698.png differ diff --git a/assets/example_images/1715.png b/assets/example_images/1715.png new file mode 100644 index 0000000..2ee44da Binary files /dev/null and b/assets/example_images/1715.png differ diff --git 
a/assets/example_images/1735.png b/assets/example_images/1735.png new file mode 100644 index 0000000..a7a722c Binary files /dev/null and b/assets/example_images/1735.png differ diff --git a/assets/example_images/1738.png b/assets/example_images/1738.png new file mode 100644 index 0000000..50d4020 Binary files /dev/null and b/assets/example_images/1738.png differ diff --git a/assets/example_images/1744.png b/assets/example_images/1744.png new file mode 100644 index 0000000..767f820 Binary files /dev/null and b/assets/example_images/1744.png differ diff --git a/assets/example_images/1758.png b/assets/example_images/1758.png new file mode 100644 index 0000000..aafb219 Binary files /dev/null and b/assets/example_images/1758.png differ diff --git a/assets/example_images/1772.png b/assets/example_images/1772.png new file mode 100644 index 0000000..17bcfff Binary files /dev/null and b/assets/example_images/1772.png differ diff --git a/assets/example_images/1773.png b/assets/example_images/1773.png new file mode 100644 index 0000000..1ffc05d Binary files /dev/null and b/assets/example_images/1773.png differ diff --git a/assets/example_images/1778.png b/assets/example_images/1778.png new file mode 100644 index 0000000..4819163 Binary files /dev/null and b/assets/example_images/1778.png differ diff --git a/assets/example_images/179.png b/assets/example_images/179.png new file mode 100644 index 0000000..a2d4160 Binary files /dev/null and b/assets/example_images/179.png differ diff --git a/assets/example_images/1898.png b/assets/example_images/1898.png new file mode 100644 index 0000000..af43116 Binary files /dev/null and b/assets/example_images/1898.png differ diff --git a/assets/example_images/191.png b/assets/example_images/191.png new file mode 100644 index 0000000..0d342eb Binary files /dev/null and b/assets/example_images/191.png differ diff --git a/assets/example_images/195.png b/assets/example_images/195.png new file mode 100644 index 0000000..530444f Binary files /dev/null and b/assets/example_images/195.png differ diff --git a/assets/example_images/197.png b/assets/example_images/197.png new file mode 100644 index 0000000..b23b422 Binary files /dev/null and b/assets/example_images/197.png differ diff --git a/assets/example_images/198.png b/assets/example_images/198.png new file mode 100644 index 0000000..3215f00 Binary files /dev/null and b/assets/example_images/198.png differ diff --git a/assets/example_images/202.png b/assets/example_images/202.png new file mode 100644 index 0000000..a1389c8 Binary files /dev/null and b/assets/example_images/202.png differ diff --git a/assets/example_images/203.png b/assets/example_images/203.png new file mode 100644 index 0000000..a45b38e Binary files /dev/null and b/assets/example_images/203.png differ diff --git a/assets/example_images/218.png b/assets/example_images/218.png new file mode 100644 index 0000000..f8f9b29 Binary files /dev/null and b/assets/example_images/218.png differ diff --git a/assets/example_images/219.png b/assets/example_images/219.png new file mode 100644 index 0000000..61369cf Binary files /dev/null and b/assets/example_images/219.png differ diff --git a/assets/example_images/379.png b/assets/example_images/379.png new file mode 100644 index 0000000..0728d83 Binary files /dev/null and b/assets/example_images/379.png differ diff --git a/assets/example_images/380.png b/assets/example_images/380.png new file mode 100644 index 0000000..084a4c7 Binary files /dev/null and b/assets/example_images/380.png differ diff --git 
a/assets/example_images/419.png b/assets/example_images/419.png new file mode 100644 index 0000000..8e7cec9 Binary files /dev/null and b/assets/example_images/419.png differ diff --git a/assets/example_images/583.png b/assets/example_images/583.png new file mode 100644 index 0000000..c303211 Binary files /dev/null and b/assets/example_images/583.png differ diff --git a/assets/example_images/888.png b/assets/example_images/888.png new file mode 100644 index 0000000..185a4ce Binary files /dev/null and b/assets/example_images/888.png differ diff --git a/assets/example_images/895.png b/assets/example_images/895.png new file mode 100644 index 0000000..cf29d13 Binary files /dev/null and b/assets/example_images/895.png differ diff --git a/assets/example_images/example_000.png b/assets/example_images/example_000.png new file mode 100644 index 0000000..6222237 Binary files /dev/null and b/assets/example_images/example_000.png differ diff --git a/assets/example_images/example_002.png b/assets/example_images/example_002.png new file mode 100644 index 0000000..a6fd2a5 Binary files /dev/null and b/assets/example_images/example_002.png differ diff --git a/assets/example_mv_images/1/back.png b/assets/example_mv_images/1/back.png new file mode 100644 index 0000000..b4e0509 Binary files /dev/null and b/assets/example_mv_images/1/back.png differ diff --git a/assets/example_mv_images/1/front.png b/assets/example_mv_images/1/front.png new file mode 100644 index 0000000..1417f8c Binary files /dev/null and b/assets/example_mv_images/1/front.png differ diff --git a/assets/example_mv_images/1/left.png b/assets/example_mv_images/1/left.png new file mode 100644 index 0000000..ba76d31 Binary files /dev/null and b/assets/example_mv_images/1/left.png differ diff --git a/assets/example_mv_images/10/back.png b/assets/example_mv_images/10/back.png new file mode 100644 index 0000000..eef6ab0 Binary files /dev/null and b/assets/example_mv_images/10/back.png differ diff --git a/assets/example_mv_images/10/front.png b/assets/example_mv_images/10/front.png new file mode 100644 index 0000000..dda89b7 Binary files /dev/null and b/assets/example_mv_images/10/front.png differ diff --git a/assets/example_mv_images/10/left.png b/assets/example_mv_images/10/left.png new file mode 100644 index 0000000..e0579df Binary files /dev/null and b/assets/example_mv_images/10/left.png differ diff --git a/assets/example_mv_images/11/back.png b/assets/example_mv_images/11/back.png new file mode 100644 index 0000000..b586caf Binary files /dev/null and b/assets/example_mv_images/11/back.png differ diff --git a/assets/example_mv_images/11/front.png b/assets/example_mv_images/11/front.png new file mode 100644 index 0000000..595f9d6 Binary files /dev/null and b/assets/example_mv_images/11/front.png differ diff --git a/assets/example_mv_images/11/left.png b/assets/example_mv_images/11/left.png new file mode 100644 index 0000000..e83eccf Binary files /dev/null and b/assets/example_mv_images/11/left.png differ diff --git a/assets/example_mv_images/12/back.png b/assets/example_mv_images/12/back.png new file mode 100644 index 0000000..c49e0fc Binary files /dev/null and b/assets/example_mv_images/12/back.png differ diff --git a/assets/example_mv_images/12/front.png b/assets/example_mv_images/12/front.png new file mode 100644 index 0000000..148cd51 Binary files /dev/null and b/assets/example_mv_images/12/front.png differ diff --git a/assets/example_mv_images/12/left.png b/assets/example_mv_images/12/left.png new file mode 100644 index 0000000..4fbdb35 
Binary files /dev/null and b/assets/example_mv_images/12/left.png differ diff --git a/assets/example_mv_images/13/back.png b/assets/example_mv_images/13/back.png new file mode 100644 index 0000000..26685f5 Binary files /dev/null and b/assets/example_mv_images/13/back.png differ diff --git a/assets/example_mv_images/13/front.png b/assets/example_mv_images/13/front.png new file mode 100644 index 0000000..95053ac Binary files /dev/null and b/assets/example_mv_images/13/front.png differ diff --git a/assets/example_mv_images/13/left.png b/assets/example_mv_images/13/left.png new file mode 100644 index 0000000..34fe663 Binary files /dev/null and b/assets/example_mv_images/13/left.png differ diff --git a/assets/example_mv_images/14/back.png b/assets/example_mv_images/14/back.png new file mode 100644 index 0000000..1a48313 Binary files /dev/null and b/assets/example_mv_images/14/back.png differ diff --git a/assets/example_mv_images/14/front.png b/assets/example_mv_images/14/front.png new file mode 100644 index 0000000..3b58dfa Binary files /dev/null and b/assets/example_mv_images/14/front.png differ diff --git a/assets/example_mv_images/14/left.png b/assets/example_mv_images/14/left.png new file mode 100644 index 0000000..9842b3a Binary files /dev/null and b/assets/example_mv_images/14/left.png differ diff --git a/assets/example_mv_images/2/back.png b/assets/example_mv_images/2/back.png new file mode 100644 index 0000000..88a0513 Binary files /dev/null and b/assets/example_mv_images/2/back.png differ diff --git a/assets/example_mv_images/2/front.png b/assets/example_mv_images/2/front.png new file mode 100644 index 0000000..35c55ba Binary files /dev/null and b/assets/example_mv_images/2/front.png differ diff --git a/assets/example_mv_images/2/left.png b/assets/example_mv_images/2/left.png new file mode 100644 index 0000000..bd47d66 Binary files /dev/null and b/assets/example_mv_images/2/left.png differ diff --git a/assets/example_mv_images/3/back.png b/assets/example_mv_images/3/back.png new file mode 100644 index 0000000..98185fe Binary files /dev/null and b/assets/example_mv_images/3/back.png differ diff --git a/assets/example_mv_images/3/front.png b/assets/example_mv_images/3/front.png new file mode 100644 index 0000000..1265af6 Binary files /dev/null and b/assets/example_mv_images/3/front.png differ diff --git a/assets/example_mv_images/3/left.png b/assets/example_mv_images/3/left.png new file mode 100644 index 0000000..df83c19 Binary files /dev/null and b/assets/example_mv_images/3/left.png differ diff --git a/assets/example_mv_images/4/back.png b/assets/example_mv_images/4/back.png new file mode 100644 index 0000000..c818617 Binary files /dev/null and b/assets/example_mv_images/4/back.png differ diff --git a/assets/example_mv_images/4/front.png b/assets/example_mv_images/4/front.png new file mode 100644 index 0000000..8758fd6 Binary files /dev/null and b/assets/example_mv_images/4/front.png differ diff --git a/assets/example_mv_images/4/left.png b/assets/example_mv_images/4/left.png new file mode 100644 index 0000000..584be7f Binary files /dev/null and b/assets/example_mv_images/4/left.png differ diff --git a/assets/example_mv_images/5/back.png b/assets/example_mv_images/5/back.png new file mode 100644 index 0000000..71e53e1 Binary files /dev/null and b/assets/example_mv_images/5/back.png differ diff --git a/assets/example_mv_images/5/front.png b/assets/example_mv_images/5/front.png new file mode 100644 index 0000000..041f4ac Binary files /dev/null and b/assets/example_mv_images/5/front.png 
differ diff --git a/assets/example_mv_images/5/left.png b/assets/example_mv_images/5/left.png new file mode 100644 index 0000000..2337b26 Binary files /dev/null and b/assets/example_mv_images/5/left.png differ diff --git a/assets/example_mv_images/6/back.png b/assets/example_mv_images/6/back.png new file mode 100644 index 0000000..6ceb5d8 Binary files /dev/null and b/assets/example_mv_images/6/back.png differ diff --git a/assets/example_mv_images/6/front.png b/assets/example_mv_images/6/front.png new file mode 100644 index 0000000..95fc2c0 Binary files /dev/null and b/assets/example_mv_images/6/front.png differ diff --git a/assets/example_mv_images/6/left.png b/assets/example_mv_images/6/left.png new file mode 100644 index 0000000..944a731 Binary files /dev/null and b/assets/example_mv_images/6/left.png differ diff --git a/assets/example_mv_images/7/back.png b/assets/example_mv_images/7/back.png new file mode 100644 index 0000000..5ef772d Binary files /dev/null and b/assets/example_mv_images/7/back.png differ diff --git a/assets/example_mv_images/7/front.png b/assets/example_mv_images/7/front.png new file mode 100644 index 0000000..01b20d8 Binary files /dev/null and b/assets/example_mv_images/7/front.png differ diff --git a/assets/example_mv_images/7/left.png b/assets/example_mv_images/7/left.png new file mode 100644 index 0000000..bfa778a Binary files /dev/null and b/assets/example_mv_images/7/left.png differ diff --git a/assets/example_mv_images/8/back.png b/assets/example_mv_images/8/back.png new file mode 100644 index 0000000..d1d6b9d Binary files /dev/null and b/assets/example_mv_images/8/back.png differ diff --git a/assets/example_mv_images/8/front.png b/assets/example_mv_images/8/front.png new file mode 100644 index 0000000..9e3c6d8 Binary files /dev/null and b/assets/example_mv_images/8/front.png differ diff --git a/assets/example_mv_images/8/left.png b/assets/example_mv_images/8/left.png new file mode 100644 index 0000000..2aeb68a Binary files /dev/null and b/assets/example_mv_images/8/left.png differ diff --git a/assets/example_mv_images/9/back.png b/assets/example_mv_images/9/back.png new file mode 100644 index 0000000..e35be4b Binary files /dev/null and b/assets/example_mv_images/9/back.png differ diff --git a/assets/example_mv_images/9/front.png b/assets/example_mv_images/9/front.png new file mode 100644 index 0000000..c73d819 Binary files /dev/null and b/assets/example_mv_images/9/front.png differ diff --git a/assets/example_mv_images/9/left.png b/assets/example_mv_images/9/left.png new file mode 100644 index 0000000..4a2736c Binary files /dev/null and b/assets/example_mv_images/9/left.png differ diff --git a/assets/example_prompts.txt b/assets/example_prompts.txt new file mode 100644 index 0000000..5155022 --- /dev/null +++ b/assets/example_prompts.txt @@ -0,0 +1,5 @@ +一片绿色的树叶在白色背景上居中展现,清晰的纹理 +一只棕白相间的仓鼠,站在白色背景前。照片采用居中构图方式,卡通风格 +一盆绿色植物生长在红色花盆中,居中,写实 +a pot of green plants grows in a red flower pot. 
+a lovely rabbit eating carrots diff --git a/assets/images/arch.jpg b/assets/images/arch.jpg new file mode 100644 index 0000000..c2e608a Binary files /dev/null and b/assets/images/arch.jpg differ diff --git a/assets/images/e2e-1.gif b/assets/images/e2e-1.gif new file mode 100644 index 0000000..a79e112 Binary files /dev/null and b/assets/images/e2e-1.gif differ diff --git a/assets/images/e2e-2.gif b/assets/images/e2e-2.gif new file mode 100644 index 0000000..1653f8a Binary files /dev/null and b/assets/images/e2e-2.gif differ diff --git a/assets/images/system.jpg b/assets/images/system.jpg new file mode 100644 index 0000000..8ab500b Binary files /dev/null and b/assets/images/system.jpg differ diff --git a/assets/images/teaser.jpg b/assets/images/teaser.jpg new file mode 100644 index 0000000..6992a0d Binary files /dev/null and b/assets/images/teaser.jpg differ diff --git a/assets/images/teaser_wo_logo.jpg b/assets/images/teaser_wo_logo.jpg new file mode 100644 index 0000000..ee724da Binary files /dev/null and b/assets/images/teaser_wo_logo.jpg differ diff --git a/assets/modelviewer-template.html b/assets/modelviewer-template.html new file mode 100644 index 0000000..3406cb1 --- /dev/null +++ b/assets/modelviewer-template.html @@ -0,0 +1,81 @@ + + + + + + + + + + + + + +
[The 81 lines of HTML/JS added as assets/modelviewer-template.html were not preserved in this extract; only the diff "+" markers survived.]
\ No newline at end of file diff --git a/assets/modelviewer-textured-template.html b/assets/modelviewer-textured-template.html new file mode 100644 index 0000000..5f84cae --- /dev/null +++ b/assets/modelviewer-textured-template.html @@ -0,0 +1,136 @@
[The 136 lines of HTML/JS added as assets/modelviewer-textured-template.html were not preserved in this extract; the only surviving text labels are "Appearance" and "Geometry".]
\ No newline at end of file diff --git a/assets/qrcode/discord.png b/assets/qrcode/discord.png new file mode 100644 index 0000000..a9c326d Binary files /dev/null and b/assets/qrcode/discord.png differ diff --git a/assets/qrcode/wechat.png b/assets/qrcode/wechat.png new file mode 100644 index 0000000..4f25092 Binary files /dev/null and b/assets/qrcode/wechat.png differ diff --git a/assets/qrcode/x.png b/assets/qrcode/x.png new file mode 100644 index 0000000..e9f9044 Binary files /dev/null and b/assets/qrcode/x.png differ diff --git a/assets/qrcode/xiaohongshu.png b/assets/qrcode/xiaohongshu.png new file mode 100644 index 0000000..7ace644 Binary files /dev/null and b/assets/qrcode/xiaohongshu.png differ diff --git a/blender_addon.py b/blender_addon.py new file mode 100644 index 0000000..149745c --- /dev/null +++ b/blender_addon.py @@ -0,0 +1,347 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the respective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
+ +bl_info = { + "name": "Hunyuan3D-2 Generator", + "author": "Tencent Hunyuan3D", + "version": (1, 0), + "blender": (3, 0, 0), + "location": "View3D > Sidebar > Hunyuan3D-2 3D Generator", + "description": "Generate/Texturing 3D models from text descriptions or images", + "category": "3D View", +} +import base64 +import os +import tempfile +import threading + +import bpy +import requests +from bpy.props import StringProperty, BoolProperty, IntProperty, FloatProperty + + +class Hunyuan3DProperties(bpy.types.PropertyGroup): + prompt: StringProperty( + name="Text Prompt", + description="Describe what you want to generate", + default="" + ) + api_url: StringProperty( + name="API URL", + description="URL of the Text-to-3D API service", + default="http://localhost:8080" + ) + is_processing: BoolProperty( + name="Processing", + default=False + ) + job_id: StringProperty( + name="Job ID", + default="" + ) + status_message: StringProperty( + name="Status Message", + default="" + ) + # 添加图片路径属性 + image_path: StringProperty( + name="Image", + description="Select an image to upload", + subtype='FILE_PATH' + ) + # 修改后的 octree_resolution 属性 + octree_resolution: IntProperty( + name="Octree Resolution", + description="Octree resolution for the 3D generation", + default=256, + min=128, + max=512, + ) + num_inference_steps: IntProperty( + name="Number of Inference Steps", + description="Number of inference steps for the 3D generation", + default=20, + min=20, + max=50 + ) + guidance_scale: FloatProperty( + name="Guidance Scale", + description="Guidance scale for the 3D generation", + default=5.5, + min=1.0, + max=10.0 + ) + # 添加 texture 属性 + texture: BoolProperty( + name="Generate Texture", + description="Whether to generate texture for the 3D model", + default=False + ) + + +class Hunyuan3DOperator(bpy.types.Operator): + bl_idname = "object.generate_3d" + bl_label = "Generate 3D Model" + bl_description = "Generate a 3D model from text description, an image or a selected mesh" + + job_id = '' + prompt = "" + api_url = "" + image_path = "" + octree_resolution = 256 + num_inference_steps = 20 + guidance_scale = 5.5 + texture = False # 新增属性 + selected_mesh_base64 = "" + selected_mesh = None # 新增属性,用于存储选中的 mesh + + thread = None + task_finished = False + + def modal(self, context, event): + if event.type in {'RIGHTMOUSE', 'ESC'}: + return {'CANCELLED'} + + if self.task_finished: + print("Threaded task completed") + self.task_finished = False + props = context.scene.gen_3d_props + props.is_processing = False + + return {'PASS_THROUGH'} + + def invoke(self, context, event): + # 启动线程 + props = context.scene.gen_3d_props + self.prompt = props.prompt + self.api_url = props.api_url + self.image_path = props.image_path + self.octree_resolution = props.octree_resolution + self.num_inference_steps = props.num_inference_steps + self.guidance_scale = props.guidance_scale + self.texture = props.texture # 获取 texture 属性的值 + + if self.prompt == "" and self.image_path == "": + self.report({'WARNING'}, "Please enter some text or select an image first.") + return {'FINISHED'} + + # 保存选中的 mesh 对象引用 + for obj in context.selected_objects: + if obj.type == 'MESH': + self.selected_mesh = obj + break + + if self.selected_mesh: + temp_glb_file = tempfile.NamedTemporaryFile(delete=False, suffix=".glb") + temp_glb_file.close() + bpy.ops.export_scene.gltf(filepath=temp_glb_file.name, use_selection=True) + with open(temp_glb_file.name, "rb") as file: + mesh_data = file.read() + mesh_b64_str = base64.b64encode(mesh_data).decode() + 
os.unlink(temp_glb_file.name) + self.selected_mesh_base64 = mesh_b64_str + + props.is_processing = True + + # 将相对路径转换为相对于 Blender 文件所在目录的绝对路径 + blend_file_dir = os.path.dirname(bpy.data.filepath) + self.report({'INFO'}, f"blend_file_dir {blend_file_dir}") + self.report({'INFO'}, f"image_path {self.image_path}") + if self.image_path.startswith('//'): + self.image_path = self.image_path[2:] + self.image_path = os.path.join(blend_file_dir, self.image_path) + + if self.selected_mesh and self.texture: + props.status_message = "Texturing Selected Mesh...\n" \ + "This may take several minutes depending \n on your GPU power." + else: + mesh_type = 'Textured Mesh' if self.texture else 'White Mesh' + prompt_type = 'Text Prompt' if self.prompt else 'Image' + props.status_message = f"Generating {mesh_type} with {prompt_type}...\n" \ + "This may take several minutes depending \n on your GPU power." + + self.thread = threading.Thread(target=self.generate_model) + self.thread.start() + + wm = context.window_manager + wm.modal_handler_add(self) + return {'RUNNING_MODAL'} + + def generate_model(self): + self.report({'INFO'}, f"Generation Start") + base_url = self.api_url.rstrip('/') + + try: + if self.selected_mesh_base64 and self.texture: + # Texturing the selected mesh + if self.image_path and os.path.exists(self.image_path): + self.report({'INFO'}, f"Post Texturing with Image") + # 打开图片文件并以二进制模式读取 + with open(self.image_path, "rb") as file: + # 读取文件内容 + image_data = file.read() + # 对图片数据进行 Base64 编码 + img_b64_str = base64.b64encode(image_data).decode() + response = requests.post( + f"{base_url}/generate", + json={ + "mesh": self.selected_mesh_base64, + "image": img_b64_str, + "octree_resolution": self.octree_resolution, + "num_inference_steps": self.num_inference_steps, + "guidance_scale": self.guidance_scale, + "texture": self.texture # 传递 texture 参数 + }, + ) + else: + self.report({'INFO'}, f"Post Texturing with Text") + response = requests.post( + f"{base_url}/generate", + json={ + "mesh": self.selected_mesh_base64, + "text": self.prompt, + "octree_resolution": self.octree_resolution, + "num_inference_steps": self.num_inference_steps, + "guidance_scale": self.guidance_scale, + "texture": self.texture # 传递 texture 参数 + }, + ) + else: + if self.image_path: + if not os.path.exists(self.image_path): + self.report({'ERROR'}, f"Image path does not exist {self.image_path}") + raise Exception(f'Image path does not exist {self.image_path}') + self.report({'INFO'}, f"Post Start Image to 3D") + # 打开图片文件并以二进制模式读取 + with open(self.image_path, "rb") as file: + # 读取文件内容 + image_data = file.read() + # 对图片数据进行 Base64 编码 + img_b64_str = base64.b64encode(image_data).decode() + response = requests.post( + f"{base_url}/generate", + json={ + "image": img_b64_str, + "octree_resolution": self.octree_resolution, + "num_inference_steps": self.num_inference_steps, + "guidance_scale": self.guidance_scale, + "texture": self.texture # 传递 texture 参数 + }, + ) + else: + self.report({'INFO'}, f"Post Start Text to 3D") + response = requests.post( + f"{base_url}/generate", + json={ + "text": self.prompt, + "octree_resolution": self.octree_resolution, + "num_inference_steps": self.num_inference_steps, + "guidance_scale": self.guidance_scale, + "texture": self.texture # 传递 texture 参数 + }, + ) + self.report({'INFO'}, f"Post Done") + + if response.status_code != 200: + self.report({'ERROR'}, f"Generation failed: {response.text}") + return + + # Decode base64 and save to temporary file + temp_file = tempfile.NamedTemporaryFile(delete=False, 
suffix=".glb") + temp_file.write(response.content) + temp_file.close() + + # Import the GLB file in the main thread + def import_handler(): + bpy.ops.import_scene.gltf(filepath=temp_file.name) + os.unlink(temp_file.name) + + # 获取新导入的 mesh + new_obj = bpy.context.selected_objects[0] if bpy.context.selected_objects else None + if new_obj and self.selected_mesh and self.texture: + # 应用选中 mesh 的位置、旋转和缩放 + new_obj.location = self.selected_mesh.location + new_obj.rotation_euler = self.selected_mesh.rotation_euler + new_obj.scale = self.selected_mesh.scale + + # 隐藏原来的 mesh + self.selected_mesh.hide_set(True) + self.selected_mesh.hide_render = True + + return None + + bpy.app.timers.register(import_handler) + + except Exception as e: + self.report({'ERROR'}, f"Error: {str(e)}") + + finally: + self.task_finished = True + self.selected_mesh_base64 = "" + + +class Hunyuan3DPanel(bpy.types.Panel): + bl_space_type = 'VIEW_3D' + bl_region_type = 'UI' + bl_category = 'Hunyuan3D-2' + bl_label = 'Hunyuan3D-2 3D Generator' + + def draw(self, context): + layout = self.layout + props = context.scene.gen_3d_props + + layout.prop(props, "api_url") + layout.prop(props, "prompt") + # 添加图片选择器 + layout.prop(props, "image_path") + # 添加新属性的 UI 元素 + layout.prop(props, "octree_resolution") + layout.prop(props, "num_inference_steps") + layout.prop(props, "guidance_scale") + # 添加 texture 属性的 UI 元素 + layout.prop(props, "texture") + + row = layout.row() + row.enabled = not props.is_processing + row.operator("object.generate_3d") + + if props.is_processing: + if props.status_message: + for line in props.status_message.split("\n"): + layout.label(text=line) + else: + layout.label("Processing...") + + +classes = ( + Hunyuan3DProperties, + Hunyuan3DOperator, + Hunyuan3DPanel, +) + + +def register(): + for cls in classes: + bpy.utils.register_class(cls) + bpy.types.Scene.gen_3d_props = bpy.props.PointerProperty(type=Hunyuan3DProperties) + + +def unregister(): + for cls in reversed(classes): + bpy.utils.unregister_class(cls) + del bpy.types.Scene.gen_3d_props + + +if __name__ == "__main__": + register() diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..061f32f --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. 
+ echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..bdcdd60 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,25 @@ +myst-parser +sphinx-rtd-theme +furo +sphinx-copybutton +sphinx-inline-tabs +nbsphinx +nbsphinx_link +linkify-it-py +linkify +ipython + +torch +imageio +scikit_image +matplotlib +munch +tfpnp +cvxpy +torchlights +tensorboardX +termcolor +proximal +opencv-python +huggingface_hub +torchvision \ No newline at end of file diff --git a/docs/source/_static/brand.png b/docs/source/_static/brand.png new file mode 100644 index 0000000..7e7528a Binary files /dev/null and b/docs/source/_static/brand.png differ diff --git a/docs/source/_static/css/custom.css b/docs/source/_static/css/custom.css new file mode 100644 index 0000000..05a0d46 --- /dev/null +++ b/docs/source/_static/css/custom.css @@ -0,0 +1,38 @@ + +/*.sidebar-logo {*/ +/* display: block;*/ +/* margin: 0;*/ +/* max-width: 50%;*/ +/*}*/ + +.nbsphinx-gallery { + display: grid; + grid-template-columns: repeat(auto-fill, minmax(200px, 1fr)); + gap: 5px; + margin-top: 1em; + margin-bottom: 1em; +} + +h1 { + font-size: 2em +} + +h2 { + font-size: 1.3em +} + +h3 { + font-size: 1.25em +} + +h4 { + font-size: 1.125em +} + +h5 { + font-size: 1.07em +} + +h6 { + font-size: 1em +} \ No newline at end of file diff --git a/docs/source/_static/favicon.ico b/docs/source/_static/favicon.ico new file mode 100644 index 0000000..927560b Binary files /dev/null and b/docs/source/_static/favicon.ico differ diff --git a/docs/source/_static/image/example_deconv.png b/docs/source/_static/image/example_deconv.png new file mode 100644 index 0000000..00d8085 Binary files /dev/null and b/docs/source/_static/image/example_deconv.png differ diff --git a/docs/source/_static/image/optic_results.png b/docs/source/_static/image/optic_results.png new file mode 100644 index 0000000..fb0bc49 Binary files /dev/null and b/docs/source/_static/image/optic_results.png differ diff --git a/docs/source/_static/image/psf.png b/docs/source/_static/image/psf.png new file mode 100644 index 0000000..2197af7 Binary files /dev/null and b/docs/source/_static/image/psf.png differ diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..5e22fb8 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,141 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+ +import os +import sys + +sys.path.insert(0, os.path.abspath(".")) +sys.path.insert(0, os.path.abspath("../../")) + +# -- Project information ----------------------------------------------------- + +project = 'Hunyuan3D-2' +copyright = '2025, Tencent Hunyuan3D' +author = 'Hunyuan3D Team' + +# The full version, including alpha/beta/rc tags +release = '0.0.1' + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'myst_parser', + 'nbsphinx', + 'nbsphinx_link', + # "myst_nb", + 'sphinx_copybutton', + # "sphinx_inline_tabs", + # https://sphinx-codeautolink.readthedocs.io/en/latest/examples.html + 'sphinx.ext.autodoc', + "sphinx.ext.intersphinx", + "sphinx.ext.extlinks", + 'sphinx.ext.autosummary', + 'sphinx.ext.doctest', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.mathjax', + 'sphinx.ext.viewcode', + 'sphinx.ext.napoleon', +] + +# -- Options for extlinks ---------------------------------------------------- +# + +extlinks = { + "pypi": ("https://pypi.org/project/%s/", "%s"), +} + +# -- Options for intersphinx ------------------------------------------------- +# + +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), + "sphinx": ("https://www.sphinx-doc.org/en/master", None), + 'torch': ('https://pytorch.org/docs/master/', None) +} + +napoleon_preprocess_types = True + +myst_enable_extensions = [ + "amsmath", + "colon_fence", + "deflist", + "dollarmath", + "fieldlist", + "html_admonition", + "html_image", + "linkify", + "replacements", + "smartquotes", + "strikethrough", + "substitution", + "tasklist", +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +# html_theme = 'alabaster' +# html_theme = 'sphinx_rtd_theme' +html_theme = "furo" +html_title = "Hunyuan3D-2" +language = "en" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ['_static'] + +html_theme_options = { + "light_css_variables": { + "font-stack": "Arial,Noto Sans,sans-serif", + "font-stack--monospace": "IBM Plex Mono,ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace", + }, + "announcement": 'Release 🤗Turbo Series and FlashVDM, Fast Shape Generation within 1 Second Right Now!', +} + +# +# -- Options for TODOs ------------------------------------------------------- +# +todo_include_todos = True + +# +# -- Options for Markdown files ---------------------------------------------- +# +myst_admonition_enable = True +myst_deflist_enable = True +myst_heading_anchors = 3 + +html_favicon = '_static/favicon.ico' + +pygments_style = "default" +pygments_dark_style = "github-dark" + +html_css_files = [ + 'css/custom.css', +] diff --git a/examples/fast_shape_gen_multiview.py b/examples/fast_shape_gen_multiview.py new file mode 100644 index 0000000..bd7970a --- /dev/null +++ b/examples/fast_shape_gen_multiview.py @@ -0,0 +1,38 @@ +import time + +import torch +from PIL import Image + +from hy3dgen.rembg import BackgroundRemover +from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline + +images = { + "front": "assets/example_mv_images/1/front.png", + "left": "assets/example_mv_images/1/left.png", + "back": "assets/example_mv_images/1/back.png" +} + +for key in images: + image = Image.open(images[key]).convert("RGBA") + if image.mode == 'RGB': + rembg = BackgroundRemover() + image = rembg(image) + images[key] = image + +pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained( + 'tencent/Hunyuan3D-2mv', + subfolder='hunyuan3d-dit-v2-mv-turbo', + variant='fp16' +) +pipeline.enable_flashvdm() +start_time = time.time() +mesh = pipeline( + image=images, + num_inference_steps=5, + octree_resolution=380, + num_chunks=20000, + generator=torch.manual_seed(12345), + output_type='trimesh' +)[0] +print("--- %s seconds ---" % (time.time() - start_time)) +mesh.export(f'demo_mv3.glb') diff --git a/examples/fast_shape_gen_with_flashvdm.py b/examples/fast_shape_gen_with_flashvdm.py new file mode 100644 index 0000000..87f1f26 --- /dev/null +++ b/examples/fast_shape_gen_with_flashvdm.py @@ -0,0 +1,46 @@ +# HY3DGEN_DEBUG=1 USE_SAGEATTN=1 python3 examples/fast_shape_gen_with_flashvdm.py +# HY3DGEN_DEBUG=1 USE_SAGEATTN=0 python3 examples/fast_shape_gen_with_flashvdm.py + +import os +import time + +import torch +from PIL import Image + +from hy3dgen.rembg import BackgroundRemover +from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline + +pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained( + 'tencent/Hunyuan3D-2', + subfolder='hunyuan3d-dit-v2-0-turbo', + use_safetensors=True, +) +pipeline.enable_flashvdm() +# pipeline.compile() + +image_path = 'assets/demo.png' +image = Image.open(image_path).convert("RGBA") +if image.mode == 'RGB': + rembg = BackgroundRemover() + image = rembg(image) + + +def run(): + return pipeline( + image=image, + num_inference_steps=5, + octree_resolution=380, + num_chunks=200000, + generator=torch.manual_seed(12345), + output_type='trimesh' + )[0] + + +save_dir = 'tmp/results/' +os.makedirs(save_dir, exist_ok=True) + +for it in range(2): + start_time = time.time() + mesh = run() + print("--- %s seconds ---" % (time.time() - start_time)) + mesh.export(f'{save_dir}/run_{it}.glb') diff --git a/examples/fast_texture_gen_multiview.py b/examples/fast_texture_gen_multiview.py new file mode 100644 index 0000000..f333f8c --- /dev/null +++ b/examples/fast_texture_gen_multiview.py @@ -0,0 +1,32 @@ 
+import time + +import torch +from PIL import Image +import trimesh + +from hy3dgen.rembg import BackgroundRemover +from hy3dgen.texgen import Hunyuan3DPaintPipeline + +images_path = [ + "assets/example_mv_images/1/front.png", + "assets/example_mv_images/1/left.png", + "assets/example_mv_images/1/back.png" +] + +images = [] +for image_path in images_path: + image = Image.open(image_path) + if image.mode == 'RGB': + rembg = BackgroundRemover() + image = rembg(image) + images.append(image) + +pipeline = Hunyuan3DPaintPipeline.from_pretrained( + 'tencent/Hunyuan3D-2', + subfolder='hunyuan3d-paint-v2-0-turbo' +) + +mesh = trimesh.load('assets/1.glb') + +mesh = pipeline(mesh, image=images) +mesh.export('demo_textured.glb') \ No newline at end of file diff --git a/examples/faster_shape_gen_with_flashvdm_mini_turbo.py b/examples/faster_shape_gen_with_flashvdm_mini_turbo.py new file mode 100644 index 0000000..6ca8bb0 --- /dev/null +++ b/examples/faster_shape_gen_with_flashvdm_mini_turbo.py @@ -0,0 +1,48 @@ +# HY3DGEN_DEBUG=1 USE_SAGEATTN=1 python3 examples/faster_shape_gen_with_flashvdm_mini_turbo.py +# HY3DGEN_DEBUG=1 USE_SAGEATTN=0 python3 examples/faster_shape_gen_with_flashvdm_mini_turbo.py + +import os +import time + +import torch +from PIL import Image + +from hy3dgen.rembg import BackgroundRemover +from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline + +device = 'cuda' +pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained( + 'tencent/Hunyuan3D-2mini', + subfolder='hunyuan3d-dit-v2-mini-turbo', + use_safetensors=False, + device=device +) +pipeline.enable_flashvdm(topk_mode='merge') +# pipeline.compile() + +image_path = 'assets/demo.png' +image = Image.open(image_path).convert("RGBA") +if image.mode == 'RGB': + rembg = BackgroundRemover() + image = rembg(image) + + +def run(): + return pipeline( + image=image, + num_inference_steps=5, + octree_resolution=380, + num_chunks=20000, + generator=torch.manual_seed(12345), + output_type='trimesh' + )[0] + + +save_dir = 'tmp/results/' +os.makedirs(save_dir, exist_ok=True) + +for it in range(2): + start_time = time.time() + mesh = run() + print("--- %s seconds ---" % (time.time() - start_time)) + mesh.export(f'{save_dir}/run_{it}.glb') diff --git a/examples/shape_gen.py b/examples/shape_gen.py new file mode 100644 index 0000000..6ff4d84 --- /dev/null +++ b/examples/shape_gen.py @@ -0,0 +1,30 @@ +import time + +import torch +from PIL import Image + +from hy3dgen.rembg import BackgroundRemover +from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline + +image_path = 'assets/demo.png' +image = Image.open(image_path).convert("RGBA") +if image.mode == 'RGB': + rembg = BackgroundRemover() + image = rembg(image) + +pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained( + 'tencent/Hunyuan3D-2', + subfolder='hunyuan3d-dit-v2-0', + variant='fp16' +) + +start_time = time.time() +mesh = pipeline(image=image, + num_inference_steps=50, + octree_resolution=380, + num_chunks=20000, + generator=torch.manual_seed(12345), + output_type='trimesh' + )[0] +print("--- %s seconds ---" % (time.time() - start_time)) +mesh.export(f'demo.glb') diff --git a/examples/shape_gen_mini.py b/examples/shape_gen_mini.py new file mode 100644 index 0000000..17c0989 --- /dev/null +++ b/examples/shape_gen_mini.py @@ -0,0 +1,31 @@ +import time + +import torch +from PIL import Image + +from hy3dgen.rembg import BackgroundRemover +from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline + +image_path = 'assets/demo.png' +image = 
Image.open(image_path).convert("RGBA") +if image.mode == 'RGB': + rembg = BackgroundRemover() + image = rembg(image) + +pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained( + 'tencent/Hunyuan3D-2mini', + subfolder='hunyuan3d-dit-v2-mini', + variant='fp16' +) + +start_time = time.time() +mesh = pipeline( + image=image, + num_inference_steps=50, + octree_resolution=380, + num_chunks=20000, + generator=torch.manual_seed(12345), + output_type='trimesh' +)[0] +print("--- %s seconds ---" % (time.time() - start_time)) +mesh.export(f'demo_mini.glb') diff --git a/examples/shape_gen_multiview.py b/examples/shape_gen_multiview.py new file mode 100644 index 0000000..ff6452d --- /dev/null +++ b/examples/shape_gen_multiview.py @@ -0,0 +1,38 @@ +import time + +import torch +from PIL import Image + +from hy3dgen.rembg import BackgroundRemover +from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline + +images = { + "front": "assets/example_mv_images/1/front.png", + "left": "assets/example_mv_images/1/left.png", + "back": "assets/example_mv_images/1/back.png" +} + +for key in images: + image = Image.open(images[key]).convert("RGBA") + if image.mode == 'RGB': + rembg = BackgroundRemover() + image = rembg(image) + images[key] = image + +pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained( + 'tencent/Hunyuan3D-2mv', + subfolder='hunyuan3d-dit-v2-mv', + variant='fp16' +) + +start_time = time.time() +mesh = pipeline( + image=images, + num_inference_steps=50, + octree_resolution=380, + num_chunks=20000, + generator=torch.manual_seed(12345), + output_type='trimesh' +)[0] +print("--- %s seconds ---" % (time.time() - start_time)) +mesh.export(f'demo_mv.glb') diff --git a/examples/textured_shape_gen.py b/examples/textured_shape_gen.py new file mode 100644 index 0000000..b156c49 --- /dev/null +++ b/examples/textured_shape_gen.py @@ -0,0 +1,19 @@ +from PIL import Image + +from hy3dgen.rembg import BackgroundRemover +from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline +from hy3dgen.texgen import Hunyuan3DPaintPipeline + +model_path = 'tencent/Hunyuan3D-2' +pipeline_shapegen = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(model_path) +pipeline_texgen = Hunyuan3DPaintPipeline.from_pretrained(model_path) + +image_path = 'assets/demo.png' +image = Image.open(image_path).convert("RGBA") +if image.mode == 'RGB': + rembg = BackgroundRemover() + image = rembg(image) + +mesh = pipeline_shapegen(image=image)[0] +mesh = pipeline_texgen(mesh, image=image) +mesh.export('demo.glb') diff --git a/examples/textured_shape_gen_mini.py b/examples/textured_shape_gen_mini.py new file mode 100644 index 0000000..b4901f3 --- /dev/null +++ b/examples/textured_shape_gen_mini.py @@ -0,0 +1,36 @@ +import time + +import torch +from PIL import Image + +from hy3dgen.rembg import BackgroundRemover +from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline +from hy3dgen.texgen import Hunyuan3DPaintPipeline + +image_path = 'assets/demo.png' +image = Image.open(image_path).convert("RGBA") +if image.mode == 'RGB': + rembg = BackgroundRemover() + image = rembg(image) + +pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained( + 'tencent/Hunyuan3D-2mini', + subfolder='hunyuan3d-dit-v2-mini', + variant='fp16' +) +pipeline_texgen = Hunyuan3DPaintPipeline.from_pretrained('tencent/Hunyuan3D-2') + +start_time = time.time() +mesh = pipeline( + image=image, + num_inference_steps=50, + octree_resolution=380, + num_chunks=20000, + generator=torch.manual_seed(12345), + output_type='trimesh' +)[0] +print("--- %s seconds ---" % 
(time.time() - start_time)) +mesh.export(f'demo_mini.glb') + +mesh = pipeline_texgen(mesh, image=image) +mesh.export('demo_textured_mini.glb') diff --git a/examples/textured_shape_gen_multiview.py b/examples/textured_shape_gen_multiview.py new file mode 100644 index 0000000..8a32c5a --- /dev/null +++ b/examples/textured_shape_gen_multiview.py @@ -0,0 +1,43 @@ +import time + +import torch +from PIL import Image + +from hy3dgen.rembg import BackgroundRemover +from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline +from hy3dgen.texgen import Hunyuan3DPaintPipeline + +images = { + "front": "assets/example_mv_images/1/front.png", + "left": "assets/example_mv_images/1/left.png", + "back": "assets/example_mv_images/1/back.png" +} + +for key in images: + image = Image.open(images[key]).convert("RGBA") + if image.mode == 'RGB': + rembg = BackgroundRemover() + image = rembg(image) + images[key] = image + +pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained( + 'tencent/Hunyuan3D-2mv', + subfolder='hunyuan3d-dit-v2-mv', + variant='fp16' +) +pipeline_texgen = Hunyuan3DPaintPipeline.from_pretrained('tencent/Hunyuan3D-2') + +start_time = time.time() +mesh = pipeline( + image=images, + num_inference_steps=50, + octree_resolution=380, + num_chunks=20000, + generator=torch.manual_seed(12345), + output_type='trimesh' +)[0] +print("--- %s seconds ---" % (time.time() - start_time)) +mesh.export(f'demo_white_mesh_mv.glb') + +mesh = pipeline_texgen(mesh, image=images["front"]) +mesh.export('demo_textured_mv.glb') diff --git a/gradio_app.py b/gradio_app.py new file mode 100644 index 0000000..8d81813 --- /dev/null +++ b/gradio_app.py @@ -0,0 +1,755 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
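Editorial sketch: the textured examples above hand the raw generated mesh straight to Hunyuan3DPaintPipeline, whereas gradio_app.py (whose diff begins here) cleans and simplifies the white mesh between the two stages. A minimal sketch of that ordering, using only classes this commit exports from hy3dgen.shapegen; the asset path and the 40000-face target are illustrative assumptions, not values taken from the commit.

from PIL import Image

from hy3dgen.shapegen import (
    Hunyuan3DDiTFlowMatchingPipeline,
    FloaterRemover,
    DegenerateFaceRemover,
    FaceReducer,
)
from hy3dgen.texgen import Hunyuan3DPaintPipeline

shape_pipe = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained('tencent/Hunyuan3D-2')
paint_pipe = Hunyuan3DPaintPipeline.from_pretrained('tencent/Hunyuan3D-2')

image = Image.open('assets/demo.png').convert('RGBA')  # illustrative input

mesh = shape_pipe(image=image, output_type='trimesh')[0]

# Clean up the white mesh before texturing, mirroring the order used in gradio_app.py
mesh = FloaterRemover()(mesh)           # drop small disconnected components
mesh = DegenerateFaceRemover()(mesh)    # remove degenerate faces
mesh = FaceReducer()(mesh, 40000)       # simplify to a target face count (assumed value)

mesh = paint_pipe(mesh, image=image)
mesh.export('demo_textured_cleaned.glb')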
+ +import os +import random +import shutil +import time +from glob import glob +from pathlib import Path + +import gradio as gr +import torch +import trimesh +import uvicorn +from fastapi import FastAPI +from fastapi.staticfiles import StaticFiles +import uuid + +from hy3dgen.shapegen.utils import logger + +MAX_SEED = int(1e7) + + +def get_example_img_list(): + print('Loading example img list ...') + return sorted(glob('./assets/example_images/**/*.png', recursive=True)) + + +def get_example_txt_list(): + print('Loading example txt list ...') + txt_list = list() + for line in open('./assets/example_prompts.txt', encoding='utf-8'): + txt_list.append(line.strip()) + return txt_list + + +def get_example_mv_list(): + print('Loading example mv list ...') + mv_list = list() + root = './assets/example_mv_images' + for mv_dir in os.listdir(root): + view_list = [] + for view in ['front', 'back', 'left', 'right']: + path = os.path.join(root, mv_dir, f'{view}.png') + if os.path.exists(path): + view_list.append(path) + else: + view_list.append(None) + mv_list.append(view_list) + return mv_list + + +def gen_save_folder(max_size=200): + os.makedirs(SAVE_DIR, exist_ok=True) + + # Collect all sub-folder paths + dirs = [f for f in Path(SAVE_DIR).iterdir() if f.is_dir()] + + # If the number of folders exceeds max_size, delete the oldest one + if len(dirs) >= max_size: + # Sort by creation time; the oldest comes first + oldest_dir = min(dirs, key=lambda x: x.stat().st_ctime) + shutil.rmtree(oldest_dir) + print(f"Removed the oldest folder: {oldest_dir}") + + # Generate a new uuid-named folder + new_folder = os.path.join(SAVE_DIR, str(uuid.uuid4())) + os.makedirs(new_folder, exist_ok=True) + print(f"Created new folder: {new_folder}") + + return new_folder + + +def export_mesh(mesh, save_folder, textured=False, type='glb'): + if textured: + path = os.path.join(save_folder, f'textured_mesh.{type}') + else: + path = os.path.join(save_folder, f'white_mesh.{type}') + if type not in ['glb', 'obj']: + mesh.export(path) + else: + mesh.export(path, include_normals=textured) + return path + + +def randomize_seed_fn(seed: int, randomize_seed: bool) -> int: + if randomize_seed: + seed = random.randint(0, MAX_SEED) + return seed + + +def build_model_viewer_html(save_folder, height=660, width=790, textured=False): + # Remove first folder from path to make relative path + if textured: + related_path = f"./textured_mesh.glb" + template_name = './assets/modelviewer-textured-template.html' + output_html_path = os.path.join(save_folder, f'textured_mesh.html') + else: + related_path = f"./white_mesh.glb" + template_name = './assets/modelviewer-template.html' + output_html_path = os.path.join(save_folder, f'white_mesh.html') + offset = 50 if textured else 10 + with open(os.path.join(CURRENT_DIR, template_name), 'r', encoding='utf-8') as f: + template_html = f.read() + + with open(output_html_path, 'w', encoding='utf-8') as f: + template_html = template_html.replace('#height#', f'{height - offset}') + template_html = template_html.replace('#width#', f'{width}') + template_html = template_html.replace('#src#', f'{related_path}/') + f.write(template_html) + + rel_path = os.path.relpath(output_html_path, SAVE_DIR) + iframe_tag = f'' + print( + f'Find html file {output_html_path}, {os.path.exists(output_html_path)}, relative HTML path is /static/{rel_path}') + + return f""" +
+ {iframe_tag} +
+ """ + + +def _gen_shape( + caption=None, + image=None, + mv_image_front=None, + mv_image_back=None, + mv_image_left=None, + mv_image_right=None, + steps=50, + guidance_scale=7.5, + seed=1234, + octree_resolution=256, + check_box_rembg=False, + num_chunks=200000, + randomize_seed: bool = False, +): + if not MV_MODE and image is None and caption is None: + raise gr.Error("Please provide either a caption or an image.") + if MV_MODE: + if mv_image_front is None and mv_image_back is None and mv_image_left is None and mv_image_right is None: + raise gr.Error("Please provide at least one view image.") + image = {} + if mv_image_front: + image['front'] = mv_image_front + if mv_image_back: + image['back'] = mv_image_back + if mv_image_left: + image['left'] = mv_image_left + if mv_image_right: + image['right'] = mv_image_right + + seed = int(randomize_seed_fn(seed, randomize_seed)) + + octree_resolution = int(octree_resolution) + if caption: print('prompt is', caption) + save_folder = gen_save_folder() + stats = { + 'model': { + 'shapegen': f'{args.model_path}/{args.subfolder}', + 'texgen': f'{args.texgen_model_path}', + }, + 'params': { + 'caption': caption, + 'steps': steps, + 'guidance_scale': guidance_scale, + 'seed': seed, + 'octree_resolution': octree_resolution, + 'check_box_rembg': check_box_rembg, + 'num_chunks': num_chunks, + } + } + time_meta = {} + + if image is None: + start_time = time.time() + try: + image = t2i_worker(caption) + except Exception as e: + raise gr.Error(f"Text to 3D is disable. Please enable it by `python gradio_app.py --enable_t23d`.") + time_meta['text2image'] = time.time() - start_time + + # remove disk io to make responding faster, uncomment at your will. + # image.save(os.path.join(save_folder, 'input.png')) + if MV_MODE: + start_time = time.time() + for k, v in image.items(): + if check_box_rembg or v.mode == "RGB": + img = rmbg_worker(v.convert('RGB')) + image[k] = img + time_meta['remove background'] = time.time() - start_time + else: + if check_box_rembg or image.mode == "RGB": + start_time = time.time() + image = rmbg_worker(image.convert('RGB')) + time_meta['remove background'] = time.time() - start_time + + # remove disk io to make responding faster, uncomment at your will. 
+ # image.save(os.path.join(save_folder, 'rembg.png')) + + # image to white model + start_time = time.time() + + generator = torch.Generator() + generator = generator.manual_seed(int(seed)) + outputs = i23d_worker( + image=image, + num_inference_steps=steps, + guidance_scale=guidance_scale, + generator=generator, + octree_resolution=octree_resolution, + num_chunks=num_chunks, + output_type='mesh' + ) + time_meta['shape generation'] = time.time() - start_time + logger.info("---Shape generation takes %s seconds ---" % (time.time() - start_time)) + + tmp_start = time.time() + mesh = export_to_trimesh(outputs)[0] + time_meta['export to trimesh'] = time.time() - tmp_start + + stats['number_of_faces'] = mesh.faces.shape[0] + stats['number_of_vertices'] = mesh.vertices.shape[0] + + stats['time'] = time_meta + main_image = image if not MV_MODE else image['front'] + return mesh, main_image, save_folder, stats, seed + + +def generation_all( + caption=None, + image=None, + mv_image_front=None, + mv_image_back=None, + mv_image_left=None, + mv_image_right=None, + steps=50, + guidance_scale=7.5, + seed=1234, + octree_resolution=256, + check_box_rembg=False, + num_chunks=200000, + randomize_seed: bool = False, +): + start_time_0 = time.time() + mesh, image, save_folder, stats, seed = _gen_shape( + caption, + image, + mv_image_front=mv_image_front, + mv_image_back=mv_image_back, + mv_image_left=mv_image_left, + mv_image_right=mv_image_right, + steps=steps, + guidance_scale=guidance_scale, + seed=seed, + octree_resolution=octree_resolution, + check_box_rembg=check_box_rembg, + num_chunks=num_chunks, + randomize_seed=randomize_seed, + ) + path = export_mesh(mesh, save_folder, textured=False) + + # tmp_time = time.time() + # mesh = floater_remove_worker(mesh) + # mesh = degenerate_face_remove_worker(mesh) + # logger.info("---Postprocessing takes %s seconds ---" % (time.time() - tmp_time)) + # stats['time']['postprocessing'] = time.time() - tmp_time + + tmp_time = time.time() + mesh = face_reduce_worker(mesh) + logger.info("---Face Reduction takes %s seconds ---" % (time.time() - tmp_time)) + stats['time']['face reduction'] = time.time() - tmp_time + + tmp_time = time.time() + textured_mesh = texgen_worker(mesh, image) + logger.info("---Texture Generation takes %s seconds ---" % (time.time() - tmp_time)) + stats['time']['texture generation'] = time.time() - tmp_time + stats['time']['total'] = time.time() - start_time_0 + + textured_mesh.metadata['extras'] = stats + path_textured = export_mesh(textured_mesh, save_folder, textured=True) + model_viewer_html_textured = build_model_viewer_html(save_folder, height=HTML_HEIGHT, width=HTML_WIDTH, + textured=True) + if args.low_vram_mode: + torch.cuda.empty_cache() + return ( + gr.update(value=path), + gr.update(value=path_textured), + model_viewer_html_textured, + stats, + seed, + ) + + +def shape_generation( + caption=None, + image=None, + mv_image_front=None, + mv_image_back=None, + mv_image_left=None, + mv_image_right=None, + steps=50, + guidance_scale=7.5, + seed=1234, + octree_resolution=256, + check_box_rembg=False, + num_chunks=200000, + randomize_seed: bool = False, +): + start_time_0 = time.time() + mesh, image, save_folder, stats, seed = _gen_shape( + caption, + image, + mv_image_front=mv_image_front, + mv_image_back=mv_image_back, + mv_image_left=mv_image_left, + mv_image_right=mv_image_right, + steps=steps, + guidance_scale=guidance_scale, + seed=seed, + octree_resolution=octree_resolution, + check_box_rembg=check_box_rembg, + num_chunks=num_chunks, + 
randomize_seed=randomize_seed, + ) + stats['time']['total'] = time.time() - start_time_0 + mesh.metadata['extras'] = stats + + path = export_mesh(mesh, save_folder, textured=False) + model_viewer_html = build_model_viewer_html(save_folder, height=HTML_HEIGHT, width=HTML_WIDTH) + if args.low_vram_mode: + torch.cuda.empty_cache() + return ( + gr.update(value=path), + model_viewer_html, + stats, + seed, + ) + + +def build_app(): + title = 'Hunyuan3D-2: High Resolution Textured 3D Assets Generation' + if MV_MODE: + title = 'Hunyuan3D-2mv: Image to 3D Generation with 1-4 Views' + if 'mini' in args.subfolder: + title = 'Hunyuan3D-2mini: Strong 0.6B Image to Shape Generator' + if TURBO_MODE: + title = title.replace(':', '-Turbo: Fast ') + + title_html = f""" +
+ + {title} +
+
+ Tencent Hunyuan3D Team +
+
+ Github   + Homepage   + Hunyuan3D Studio   + Technical Report   + Pretrained Models   +
+ """ + custom_css = """ + .app.svelte-wpkpf6.svelte-wpkpf6:not(.fill_width) { + max-width: 1480px; + } + .mv-image button .wrap { + font-size: 10px; + } + + .mv-image .icon-wrap { + width: 20px; + } + + """ + + with gr.Blocks(theme=gr.themes.Base(), title='Hunyuan-3D-2.0', analytics_enabled=False, css=custom_css) as demo: + gr.HTML(title_html) + + with gr.Row(): + with gr.Column(scale=3): + with gr.Tabs(selected='tab_img_prompt') as tabs_prompt: + with gr.Tab('Image Prompt', id='tab_img_prompt', visible=not MV_MODE) as tab_ip: + image = gr.Image(label='Image', type='pil', image_mode='RGBA', height=290) + + with gr.Tab('Text Prompt', id='tab_txt_prompt', visible=HAS_T2I and not MV_MODE) as tab_tp: + caption = gr.Textbox(label='Text Prompt', + placeholder='HunyuanDiT will be used to generate image.', + info='Example: A 3D model of a cute cat, white background') + with gr.Tab('MultiView Prompt', visible=MV_MODE) as tab_mv: + # gr.Label('Please upload at least one front image.') + with gr.Row(): + mv_image_front = gr.Image(label='Front', type='pil', image_mode='RGBA', height=140, + min_width=100, elem_classes='mv-image') + mv_image_back = gr.Image(label='Back', type='pil', image_mode='RGBA', height=140, + min_width=100, elem_classes='mv-image') + with gr.Row(): + mv_image_left = gr.Image(label='Left', type='pil', image_mode='RGBA', height=140, + min_width=100, elem_classes='mv-image') + mv_image_right = gr.Image(label='Right', type='pil', image_mode='RGBA', height=140, + min_width=100, elem_classes='mv-image') + + with gr.Row(): + btn = gr.Button(value='Gen Shape', variant='primary', min_width=100) + btn_all = gr.Button(value='Gen Textured Shape', + variant='primary', + visible=HAS_TEXTUREGEN, + min_width=100) + + with gr.Group(): + file_out = gr.File(label="File", visible=False) + file_out2 = gr.File(label="File", visible=False) + + with gr.Tabs(selected='tab_options' if TURBO_MODE else 'tab_export'): + with gr.Tab("Options", id='tab_options', visible=TURBO_MODE): + gen_mode = gr.Radio(label='Generation Mode', + info='Recommendation: Turbo for most cases, Fast for very complex cases, Standard seldom use.', + choices=['Turbo', 'Fast', 'Standard'], value='Turbo') + decode_mode = gr.Radio(label='Decoding Mode', + info='The resolution for exporting mesh from generated vectset', + choices=['Low', 'Standard', 'High'], + value='Standard') + with gr.Tab('Advanced Options', id='tab_advanced_options'): + with gr.Row(): + check_box_rembg = gr.Checkbox(value=True, label='Remove Background', min_width=100) + randomize_seed = gr.Checkbox(label="Randomize seed", value=True, min_width=100) + seed = gr.Slider( + label="Seed", + minimum=0, + maximum=MAX_SEED, + step=1, + value=1234, + min_width=100, + ) + with gr.Row(): + num_steps = gr.Slider(maximum=100, + minimum=1, + value=5 if 'turbo' in args.subfolder else 30, + step=1, label='Inference Steps') + octree_resolution = gr.Slider(maximum=512, minimum=16, value=256, label='Octree Resolution') + with gr.Row(): + cfg_scale = gr.Number(value=5.0, label='Guidance Scale', min_width=100) + num_chunks = gr.Slider(maximum=5000000, minimum=1000, value=8000, + label='Number of Chunks', min_width=100) + with gr.Tab("Export", id='tab_export'): + with gr.Row(): + file_type = gr.Dropdown(label='File Type', choices=SUPPORTED_FORMATS, + value='glb', min_width=100) + reduce_face = gr.Checkbox(label='Simplify Mesh', value=False, min_width=100) + export_texture = gr.Checkbox(label='Include Texture', value=False, + visible=False, min_width=100) + target_face_num = 
gr.Slider(maximum=1000000, minimum=100, value=10000, + label='Target Face Number') + with gr.Row(): + confirm_export = gr.Button(value="Transform", min_width=100) + file_export = gr.DownloadButton(label="Download", variant='primary', + interactive=False, min_width=100) + + with gr.Column(scale=6): + with gr.Tabs(selected='gen_mesh_panel') as tabs_output: + with gr.Tab('Generated Mesh', id='gen_mesh_panel'): + html_gen_mesh = gr.HTML(HTML_OUTPUT_PLACEHOLDER, label='Output') + with gr.Tab('Exporting Mesh', id='export_mesh_panel'): + html_export_mesh = gr.HTML(HTML_OUTPUT_PLACEHOLDER, label='Output') + with gr.Tab('Mesh Statistic', id='stats_panel'): + stats = gr.Json({}, label='Mesh Stats') + + with gr.Column(scale=3 if MV_MODE else 2): + with gr.Tabs(selected='tab_img_gallery') as gallery: + with gr.Tab('Image to 3D Gallery', id='tab_img_gallery', visible=not MV_MODE) as tab_gi: + with gr.Row(): + gr.Examples(examples=example_is, inputs=[image], + label=None, examples_per_page=18) + + with gr.Tab('Text to 3D Gallery', id='tab_txt_gallery', visible=HAS_T2I and not MV_MODE) as tab_gt: + with gr.Row(): + gr.Examples(examples=example_ts, inputs=[caption], + label=None, examples_per_page=18) + with gr.Tab('MultiView to 3D Gallery', id='tab_mv_gallery', visible=MV_MODE) as tab_mv: + with gr.Row(): + gr.Examples(examples=example_mvs, + inputs=[mv_image_front, mv_image_back, mv_image_left, mv_image_right], + label=None, examples_per_page=6) + + gr.HTML(f""" +
+ Activated Model - Shape Generation ({args.model_path}/{args.subfolder}); Texture Generation ({'Hunyuan3D-2' if HAS_TEXTUREGEN else 'Unavailable'}) +
+ """) + if not HAS_TEXTUREGEN: + gr.HTML(""" +
+ Warning: + Texture synthesis is disabled due to missing requirements, + please install the requirements following README.md to activate it. +
+ """) + if not args.enable_t23d: + gr.HTML(""" +
+ Warning: + Text to 3D is disabled. To activate it, please run `python gradio_app.py --enable_t23d`. +
+ """) + + tab_ip.select(fn=lambda: gr.update(selected='tab_img_gallery'), outputs=gallery) + if HAS_T2I: + tab_tp.select(fn=lambda: gr.update(selected='tab_txt_gallery'), outputs=gallery) + + btn.click( + shape_generation, + inputs=[ + caption, + image, + mv_image_front, + mv_image_back, + mv_image_left, + mv_image_right, + num_steps, + cfg_scale, + seed, + octree_resolution, + check_box_rembg, + num_chunks, + randomize_seed, + ], + outputs=[file_out, html_gen_mesh, stats, seed] + ).then( + lambda: (gr.update(visible=False, value=False), gr.update(interactive=True), gr.update(interactive=True), + gr.update(interactive=False)), + outputs=[export_texture, reduce_face, confirm_export, file_export], + ).then( + lambda: gr.update(selected='gen_mesh_panel'), + outputs=[tabs_output], + ) + + btn_all.click( + generation_all, + inputs=[ + caption, + image, + mv_image_front, + mv_image_back, + mv_image_left, + mv_image_right, + num_steps, + cfg_scale, + seed, + octree_resolution, + check_box_rembg, + num_chunks, + randomize_seed, + ], + outputs=[file_out, file_out2, html_gen_mesh, stats, seed] + ).then( + lambda: (gr.update(visible=True, value=True), gr.update(interactive=False), gr.update(interactive=True), + gr.update(interactive=False)), + outputs=[export_texture, reduce_face, confirm_export, file_export], + ).then( + lambda: gr.update(selected='gen_mesh_panel'), + outputs=[tabs_output], + ) + + def on_gen_mode_change(value): + if value == 'Turbo': + return gr.update(value=5) + elif value == 'Fast': + return gr.update(value=10) + else: + return gr.update(value=30) + + gen_mode.change(on_gen_mode_change, inputs=[gen_mode], outputs=[num_steps]) + + def on_decode_mode_change(value): + if value == 'Low': + return gr.update(value=196) + elif value == 'Standard': + return gr.update(value=256) + else: + return gr.update(value=384) + + decode_mode.change(on_decode_mode_change, inputs=[decode_mode], outputs=[octree_resolution]) + + def on_export_click(file_out, file_out2, file_type, reduce_face, export_texture, target_face_num): + if file_out is None: + raise gr.Error('Please generate a mesh first.') + + print(f'exporting {file_out}') + print(f'reduce face to {target_face_num}') + if export_texture: + mesh = trimesh.load(file_out2) + save_folder = gen_save_folder() + path = export_mesh(mesh, save_folder, textured=True, type=file_type) + + # for preview + save_folder = gen_save_folder() + _ = export_mesh(mesh, save_folder, textured=True) + model_viewer_html = build_model_viewer_html(save_folder, height=HTML_HEIGHT, width=HTML_WIDTH, + textured=True) + else: + mesh = trimesh.load(file_out) + mesh = floater_remove_worker(mesh) + mesh = degenerate_face_remove_worker(mesh) + if reduce_face: + mesh = face_reduce_worker(mesh, target_face_num) + save_folder = gen_save_folder() + path = export_mesh(mesh, save_folder, textured=False, type=file_type) + + # for preview + save_folder = gen_save_folder() + _ = export_mesh(mesh, save_folder, textured=False) + model_viewer_html = build_model_viewer_html(save_folder, height=HTML_HEIGHT, width=HTML_WIDTH, + textured=False) + print(f'export to {path}') + return model_viewer_html, gr.update(value=path, interactive=True) + + confirm_export.click( + lambda: gr.update(selected='export_mesh_panel'), + outputs=[tabs_output], + ).then( + on_export_click, + inputs=[file_out, file_out2, file_type, reduce_face, export_texture, target_face_num], + outputs=[html_export_mesh, file_export] + ) + + return demo + + +if __name__ == '__main__': + import argparse + + parser = 
argparse.ArgumentParser() + parser.add_argument("--model_path", type=str, default='tencent/Hunyuan3D-2mini') + parser.add_argument("--subfolder", type=str, default='hunyuan3d-dit-v2-mini-turbo') + parser.add_argument("--texgen_model_path", type=str, default='tencent/Hunyuan3D-2') + parser.add_argument('--port', type=int, default=8080) + parser.add_argument('--host', type=str, default='0.0.0.0') + parser.add_argument('--device', type=str, default='cuda') + parser.add_argument('--mc_algo', type=str, default='mc') + parser.add_argument('--cache-path', type=str, default='gradio_cache') + parser.add_argument('--enable_t23d', action='store_true') + parser.add_argument('--disable_tex', action='store_true') + parser.add_argument('--enable_flashvdm', action='store_true') + parser.add_argument('--compile', action='store_true') + parser.add_argument('--low_vram_mode', action='store_true') + args = parser.parse_args() + + SAVE_DIR = args.cache_path + os.makedirs(SAVE_DIR, exist_ok=True) + + CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) + MV_MODE = 'mv' in args.model_path + TURBO_MODE = 'turbo' in args.subfolder + + HTML_HEIGHT = 690 if MV_MODE else 650 + HTML_WIDTH = 500 + HTML_OUTPUT_PLACEHOLDER = f""" +
+
+

Welcome to Hunyuan3D!

+

No mesh here.

+
+
+ """ + + INPUT_MESH_HTML = """ +
+
+ """ + example_is = get_example_img_list() + example_ts = get_example_txt_list() + example_mvs = get_example_mv_list() + + SUPPORTED_FORMATS = ['glb', 'obj', 'ply', 'stl'] + + HAS_TEXTUREGEN = False + if not args.disable_tex: + try: + from hy3dgen.texgen import Hunyuan3DPaintPipeline + + texgen_worker = Hunyuan3DPaintPipeline.from_pretrained(args.texgen_model_path) + if args.low_vram_mode: + texgen_worker.enable_model_cpu_offload() + # Not help much, ignore for now. + # if args.compile: + # texgen_worker.models['delight_model'].pipeline.unet.compile() + # texgen_worker.models['delight_model'].pipeline.vae.compile() + # texgen_worker.models['multiview_model'].pipeline.unet.compile() + # texgen_worker.models['multiview_model'].pipeline.vae.compile() + HAS_TEXTUREGEN = True + except Exception as e: + print(e) + print("Failed to load texture generator.") + print('Please try to install requirements by following README.md') + HAS_TEXTUREGEN = False + + HAS_T2I = True + if args.enable_t23d: + from hy3dgen.text2image import HunyuanDiTPipeline + + t2i_worker = HunyuanDiTPipeline('Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled', device=args.device) + HAS_T2I = True + + from hy3dgen.shapegen import FaceReducer, FloaterRemover, DegenerateFaceRemover, MeshSimplifier, \ + Hunyuan3DDiTFlowMatchingPipeline + from hy3dgen.shapegen.pipelines import export_to_trimesh + from hy3dgen.rembg import BackgroundRemover + + rmbg_worker = BackgroundRemover() + i23d_worker = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained( + args.model_path, + subfolder=args.subfolder, + use_safetensors=True, + device=args.device, + ) + if args.enable_flashvdm: + mc_algo = 'mc' if args.device in ['cpu', 'mps'] else args.mc_algo + i23d_worker.enable_flashvdm(mc_algo=mc_algo) + if args.compile: + i23d_worker.compile() + + floater_remove_worker = FloaterRemover() + degenerate_face_remove_worker = DegenerateFaceRemover() + face_reduce_worker = FaceReducer() + + # https://discuss.huggingface.co/t/how-to-serve-an-html-file/33921/2 + # create a FastAPI app + app = FastAPI() + # create a static directory to store the static files + static_dir = Path(SAVE_DIR).absolute() + static_dir.mkdir(parents=True, exist_ok=True) + app.mount("/static", StaticFiles(directory=static_dir, html=True), name="static") + shutil.copytree('./assets/env_maps', os.path.join(static_dir, 'env_maps'), dirs_exist_ok=True) + + if args.low_vram_mode: + torch.cuda.empty_cache() + demo = build_app() + app = gr.mount_gradio_app(app, demo, path="/") + uvicorn.run(app, host=args.host, port=args.port, workers=1) diff --git a/hy3dgen/__init__.py b/hy3dgen/__init__.py new file mode 100644 index 0000000..8bb2bf8 --- /dev/null +++ b/hy3dgen/__init__.py @@ -0,0 +1,13 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. 
+ +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. \ No newline at end of file diff --git a/hy3dgen/rembg.py b/hy3dgen/rembg.py new file mode 100644 index 0000000..6247f06 --- /dev/null +++ b/hy3dgen/rembg.py @@ -0,0 +1,25 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +from PIL import Image +from rembg import remove, new_session + + +class BackgroundRemover(): + def __init__(self): + self.session = new_session() + + def __call__(self, image: Image.Image): + output = remove(image, session=self.session, bgcolor=[255, 255, 255, 0]) + return output diff --git a/hy3dgen/shapegen/__init__.py b/hy3dgen/shapegen/__init__.py new file mode 100644 index 0000000..1b1f9cc --- /dev/null +++ b/hy3dgen/shapegen/__init__.py @@ -0,0 +1,17 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
+ +from .pipelines import Hunyuan3DDiTPipeline, Hunyuan3DDiTFlowMatchingPipeline +from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover, MeshSimplifier +from .preprocessors import ImageProcessorV2, IMAGE_PROCESSORS, DEFAULT_IMAGEPROCESSOR diff --git a/hy3dgen/shapegen/models/__init__.py b/hy3dgen/shapegen/models/__init__.py new file mode 100644 index 0000000..8179353 --- /dev/null +++ b/hy3dgen/shapegen/models/__init__.py @@ -0,0 +1,28 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + + +from .autoencoders import ShapeVAE +from .conditioner import DualImageEncoder, SingleImageEncoder, DinoImageEncoder, CLIPImageEncoder +from .denoisers import Hunyuan3DDiT diff --git a/hy3dgen/shapegen/models/autoencoders/__init__.py b/hy3dgen/shapegen/models/autoencoders/__init__.py new file mode 100644 index 0000000..20bbf8d --- /dev/null +++ b/hy3dgen/shapegen/models/autoencoders/__init__.py @@ -0,0 +1,20 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
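Editorial sketch: BackgroundRemover above is a thin wrapper around rembg's remove() with a cached session, and it returns an RGBA image whose background is made transparent. A small usage sketch follows (the input path is an assumption). Note that several example scripts call .convert("RGBA") before the `if image.mode == 'RGB'` check, so the remover branch never runs there; to actually strip a background, test the mode of the image as loaded.

from PIL import Image
from hy3dgen.rembg import BackgroundRemover

rembg = BackgroundRemover()

image = Image.open('assets/demo.png')   # illustrative path
if image.mode == 'RGB':                 # check before any convert("RGBA") call
    image = rembg(image)                # returns RGBA with a transparent background
else:
    image = image.convert('RGBA')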
+ +from .attention_blocks import CrossAttentionDecoder +from .attention_processors import FlashVDMCrossAttentionProcessor, CrossAttentionProcessor, \ + FlashVDMTopMCrossAttentionProcessor +from .model import ShapeVAE, VectsetVAE +from .surface_extractors import SurfaceExtractors, MCSurfaceExtractor, DMCSurfaceExtractor, Latent2MeshOutput +from .volume_decoders import HierarchicalVolumeDecoding, FlashVDMVolumeDecoding, VanillaVolumeDecoder diff --git a/hy3dgen/shapegen/models/autoencoders/attention_blocks.py b/hy3dgen/shapegen/models/autoencoders/attention_blocks.py new file mode 100644 index 0000000..ab34eeb --- /dev/null +++ b/hy3dgen/shapegen/models/autoencoders/attention_blocks.py @@ -0,0 +1,493 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + + +import os +from typing import Optional + +import torch +import torch.nn as nn +from einops import rearrange + +from .attention_processors import CrossAttentionProcessor +from ...utils import logger + +scaled_dot_product_attention = nn.functional.scaled_dot_product_attention + +if os.environ.get('USE_SAGEATTN', '0') == '1': + try: + from sageattention import sageattn + except ImportError: + raise ImportError('Please install the package "sageattention" to use this USE_SAGEATTN.') + scaled_dot_product_attention = sageattn + + +class FourierEmbedder(nn.Module): + """The sin/cosine positional embedding. Given an input tensor `x` of shape [n_batch, ..., c_dim], it converts + each feature dimension of `x[..., i]` into: + [ + sin(x[..., i]), + sin(f_1*x[..., i]), + sin(f_2*x[..., i]), + ... + sin(f_N * x[..., i]), + cos(x[..., i]), + cos(f_1*x[..., i]), + cos(f_2*x[..., i]), + ... + cos(f_N * x[..., i]), + x[..., i] # only present if include_input is True. + ], here f_i is the frequency. + + Denote the space is [0 / num_freqs, 1 / num_freqs, 2 / num_freqs, 3 / num_freqs, ..., (num_freqs - 1) / num_freqs]. + If logspace is True, then the frequency f_i is [2^(0 / num_freqs), ..., 2^(i / num_freqs), ...]; + Otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)]. + + Args: + num_freqs (int): the number of frequencies, default is 6; + logspace (bool): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...], + otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)]; + input_dim (int): the input dimension, default is 3; + include_input (bool): include the input tensor or not, default is True. 
+ + Attributes: + frequencies (torch.Tensor): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...], + otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1); + + out_dim (int): the embedding size, if include_input is True, it is input_dim * (num_freqs * 2 + 1), + otherwise, it is input_dim * num_freqs * 2. + + """ + + def __init__(self, + num_freqs: int = 6, + logspace: bool = True, + input_dim: int = 3, + include_input: bool = True, + include_pi: bool = True) -> None: + + """The initialization""" + + super().__init__() + + if logspace: + frequencies = 2.0 ** torch.arange( + num_freqs, + dtype=torch.float32 + ) + else: + frequencies = torch.linspace( + 1.0, + 2.0 ** (num_freqs - 1), + num_freqs, + dtype=torch.float32 + ) + + if include_pi: + frequencies *= torch.pi + + self.register_buffer("frequencies", frequencies, persistent=False) + self.include_input = include_input + self.num_freqs = num_freqs + + self.out_dim = self.get_dims(input_dim) + + def get_dims(self, input_dim): + temp = 1 if self.include_input or self.num_freqs == 0 else 0 + out_dim = input_dim * (self.num_freqs * 2 + temp) + + return out_dim + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ Forward process. + + Args: + x: tensor of shape [..., dim] + + Returns: + embedding: an embedding of `x` of shape [..., dim * (num_freqs * 2 + temp)] + where temp is 1 if include_input is True and 0 otherwise. + """ + + if self.num_freqs > 0: + embed = (x[..., None].contiguous() * self.frequencies).view(*x.shape[:-1], -1) + if self.include_input: + return torch.cat((x, embed.sin(), embed.cos()), dim=-1) + else: + return torch.cat((embed.sin(), embed.cos()), dim=-1) + else: + return x + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + self.scale_by_keep = scale_by_keep + + def forward(self, x): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. + + """ + if self.drop_prob == 0. or not self.training: + return x + keep_prob = 1 - self.drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0 and self.scale_by_keep: + random_tensor.div_(keep_prob) + return x * random_tensor + + def extra_repr(self): + return f'drop_prob={round(self.drop_prob, 3):0.3f}' + + +class MLP(nn.Module): + def __init__( + self, *, + width: int, + expand_ratio: int = 4, + output_width: int = None, + drop_path_rate: float = 0.0 + ): + super().__init__() + self.width = width + self.c_fc = nn.Linear(width, width * expand_ratio) + self.c_proj = nn.Linear(width * expand_ratio, output_width if output_width is not None else width) + self.gelu = nn.GELU() + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. 
else nn.Identity() + + def forward(self, x): + return self.drop_path(self.c_proj(self.gelu(self.c_fc(x)))) + + +class QKVMultiheadCrossAttention(nn.Module): + def __init__( + self, + *, + heads: int, + n_data: Optional[int] = None, + width=None, + qk_norm=False, + norm_layer=nn.LayerNorm + ): + super().__init__() + self.heads = heads + self.n_data = n_data + self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() + self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() + + self.attn_processor = CrossAttentionProcessor() + + def forward(self, q, kv): + _, n_ctx, _ = q.shape + bs, n_data, width = kv.shape + attn_ch = width // self.heads // 2 + q = q.view(bs, n_ctx, self.heads, -1) + kv = kv.view(bs, n_data, self.heads, -1) + k, v = torch.split(kv, attn_ch, dim=-1) + + q = self.q_norm(q) + k = self.k_norm(k) + q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v)) + out = self.attn_processor(self, q, k, v) + out = out.transpose(1, 2).reshape(bs, n_ctx, -1) + return out + + +class MultiheadCrossAttention(nn.Module): + def __init__( + self, + *, + width: int, + heads: int, + qkv_bias: bool = True, + n_data: Optional[int] = None, + data_width: Optional[int] = None, + norm_layer=nn.LayerNorm, + qk_norm: bool = False, + kv_cache: bool = False, + ): + super().__init__() + self.n_data = n_data + self.width = width + self.heads = heads + self.data_width = width if data_width is None else data_width + self.c_q = nn.Linear(width, width, bias=qkv_bias) + self.c_kv = nn.Linear(self.data_width, width * 2, bias=qkv_bias) + self.c_proj = nn.Linear(width, width) + self.attention = QKVMultiheadCrossAttention( + heads=heads, + n_data=n_data, + width=width, + norm_layer=norm_layer, + qk_norm=qk_norm + ) + self.kv_cache = kv_cache + self.data = None + + def forward(self, x, data): + x = self.c_q(x) + if self.kv_cache: + if self.data is None: + self.data = self.c_kv(data) + logger.info('Save kv cache,this should be called only once for one mesh') + data = self.data + else: + data = self.c_kv(data) + x = self.attention(x, data) + x = self.c_proj(x) + return x + + +class ResidualCrossAttentionBlock(nn.Module): + def __init__( + self, + *, + n_data: Optional[int] = None, + width: int, + heads: int, + mlp_expand_ratio: int = 4, + data_width: Optional[int] = None, + qkv_bias: bool = True, + norm_layer=nn.LayerNorm, + qk_norm: bool = False + ): + super().__init__() + + if data_width is None: + data_width = width + + self.attn = MultiheadCrossAttention( + n_data=n_data, + width=width, + heads=heads, + data_width=data_width, + qkv_bias=qkv_bias, + norm_layer=norm_layer, + qk_norm=qk_norm + ) + self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6) + self.ln_2 = norm_layer(data_width, elementwise_affine=True, eps=1e-6) + self.ln_3 = norm_layer(width, elementwise_affine=True, eps=1e-6) + self.mlp = MLP(width=width, expand_ratio=mlp_expand_ratio) + + def forward(self, x: torch.Tensor, data: torch.Tensor): + x = x + self.attn(self.ln_1(x), self.ln_2(data)) + x = x + self.mlp(self.ln_3(x)) + return x + + +class QKVMultiheadAttention(nn.Module): + def __init__( + self, + *, + heads: int, + n_ctx: int, + width=None, + qk_norm=False, + norm_layer=nn.LayerNorm + ): + super().__init__() + self.heads = heads + self.n_ctx = n_ctx + self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() + self.k_norm = norm_layer(width // heads, 
elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() + + def forward(self, qkv): + bs, n_ctx, width = qkv.shape + attn_ch = width // self.heads // 3 + qkv = qkv.view(bs, n_ctx, self.heads, -1) + q, k, v = torch.split(qkv, attn_ch, dim=-1) + + q = self.q_norm(q) + k = self.k_norm(k) + + q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v)) + out = scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1) + return out + + +class MultiheadAttention(nn.Module): + def __init__( + self, + *, + n_ctx: int, + width: int, + heads: int, + qkv_bias: bool, + norm_layer=nn.LayerNorm, + qk_norm: bool = False, + drop_path_rate: float = 0.0 + ): + super().__init__() + self.n_ctx = n_ctx + self.width = width + self.heads = heads + self.c_qkv = nn.Linear(width, width * 3, bias=qkv_bias) + self.c_proj = nn.Linear(width, width) + self.attention = QKVMultiheadAttention( + heads=heads, + n_ctx=n_ctx, + width=width, + norm_layer=norm_layer, + qk_norm=qk_norm + ) + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity() + + def forward(self, x): + x = self.c_qkv(x) + x = self.attention(x) + x = self.drop_path(self.c_proj(x)) + return x + + +class ResidualAttentionBlock(nn.Module): + def __init__( + self, + *, + n_ctx: int, + width: int, + heads: int, + qkv_bias: bool = True, + norm_layer=nn.LayerNorm, + qk_norm: bool = False, + drop_path_rate: float = 0.0, + ): + super().__init__() + self.attn = MultiheadAttention( + n_ctx=n_ctx, + width=width, + heads=heads, + qkv_bias=qkv_bias, + norm_layer=norm_layer, + qk_norm=qk_norm, + drop_path_rate=drop_path_rate + ) + self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6) + self.mlp = MLP(width=width, drop_path_rate=drop_path_rate) + self.ln_2 = norm_layer(width, elementwise_affine=True, eps=1e-6) + + def forward(self, x: torch.Tensor): + x = x + self.attn(self.ln_1(x)) + x = x + self.mlp(self.ln_2(x)) + return x + + +class Transformer(nn.Module): + def __init__( + self, + *, + n_ctx: int, + width: int, + layers: int, + heads: int, + qkv_bias: bool = True, + norm_layer=nn.LayerNorm, + qk_norm: bool = False, + drop_path_rate: float = 0.0 + ): + super().__init__() + self.n_ctx = n_ctx + self.width = width + self.layers = layers + self.resblocks = nn.ModuleList( + [ + ResidualAttentionBlock( + n_ctx=n_ctx, + width=width, + heads=heads, + qkv_bias=qkv_bias, + norm_layer=norm_layer, + qk_norm=qk_norm, + drop_path_rate=drop_path_rate + ) + for _ in range(layers) + ] + ) + + def forward(self, x: torch.Tensor): + for block in self.resblocks: + x = block(x) + return x + + +class CrossAttentionDecoder(nn.Module): + + def __init__( + self, + *, + num_latents: int, + out_channels: int, + fourier_embedder: FourierEmbedder, + width: int, + heads: int, + mlp_expand_ratio: int = 4, + downsample_ratio: int = 1, + enable_ln_post: bool = True, + qkv_bias: bool = True, + qk_norm: bool = False, + label_type: str = "binary" + ): + super().__init__() + + self.enable_ln_post = enable_ln_post + self.fourier_embedder = fourier_embedder + self.downsample_ratio = downsample_ratio + self.query_proj = nn.Linear(self.fourier_embedder.out_dim, width) + if self.downsample_ratio != 1: + self.latents_proj = nn.Linear(width * downsample_ratio, width) + if self.enable_ln_post == False: + qk_norm = False + self.cross_attn_decoder = ResidualCrossAttentionBlock( + n_data=num_latents, + width=width, + mlp_expand_ratio=mlp_expand_ratio, + heads=heads, + qkv_bias=qkv_bias, + qk_norm=qk_norm + ) + + if 
self.enable_ln_post: + self.ln_post = nn.LayerNorm(width) + self.output_proj = nn.Linear(width, out_channels) + self.label_type = label_type + self.count = 0 + + def set_cross_attention_processor(self, processor): + self.cross_attn_decoder.attn.attention.attn_processor = processor + + def set_default_cross_attention_processor(self): + self.cross_attn_decoder.attn.attention.attn_processor = CrossAttentionProcessor + + def forward(self, queries=None, query_embeddings=None, latents=None): + if query_embeddings is None: + query_embeddings = self.query_proj(self.fourier_embedder(queries).to(latents.dtype)) + self.count += query_embeddings.shape[1] + if self.downsample_ratio != 1: + latents = self.latents_proj(latents) + x = self.cross_attn_decoder(query_embeddings, latents) + if self.enable_ln_post: + x = self.ln_post(x) + occ = self.output_proj(x) + return occ diff --git a/hy3dgen/shapegen/models/autoencoders/attention_processors.py b/hy3dgen/shapegen/models/autoencoders/attention_processors.py new file mode 100644 index 0000000..f7b232e --- /dev/null +++ b/hy3dgen/shapegen/models/autoencoders/attention_processors.py @@ -0,0 +1,96 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
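Editorial sketch: a quick sanity check on FourierEmbedder from attention_blocks.py above. With include_input=True the output width is input_dim * (2 * num_freqs + 1), so the num_freqs=8 default used by ShapeVAE maps 3-D query points to 3 * 17 = 51 channels. The batch and point counts below are arbitrary.

import torch
from hy3dgen.shapegen.models.autoencoders.attention_blocks import FourierEmbedder

embedder = FourierEmbedder(num_freqs=8, input_dim=3, include_input=True, include_pi=True)
queries = torch.rand(2, 4096, 3) * 2 - 1   # toy query points in [-1, 1]

embedded = embedder(queries)
print(embedder.out_dim)    # 51
print(embedded.shape)      # torch.Size([2, 4096, 51])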
+ +import os + +import torch +import torch.nn.functional as F + +scaled_dot_product_attention = F.scaled_dot_product_attention +if os.environ.get('CA_USE_SAGEATTN', '0') == '1': + try: + from sageattention import sageattn + except ImportError: + raise ImportError('Please install the package "sageattention" to use this USE_SAGEATTN.') + scaled_dot_product_attention = sageattn + + +class CrossAttentionProcessor: + def __call__(self, attn, q, k, v): + out = scaled_dot_product_attention(q, k, v) + return out + + +class FlashVDMCrossAttentionProcessor: + def __init__(self, topk=None): + self.topk = topk + + def __call__(self, attn, q, k, v): + if k.shape[-2] == 3072: + topk = 1024 + elif k.shape[-2] == 512: + topk = 256 + else: + topk = k.shape[-2] // 3 + + if self.topk is True: + q1 = q[:, :, ::100, :] + sim = q1 @ k.transpose(-1, -2) + sim = torch.mean(sim, -2) + topk_ind = torch.topk(sim, dim=-1, k=topk).indices.squeeze(-2).unsqueeze(-1) + topk_ind = topk_ind.expand(-1, -1, -1, v.shape[-1]) + v0 = torch.gather(v, dim=-2, index=topk_ind) + k0 = torch.gather(k, dim=-2, index=topk_ind) + out = scaled_dot_product_attention(q, k0, v0) + elif self.topk is False: + out = scaled_dot_product_attention(q, k, v) + else: + idx, counts = self.topk + start = 0 + outs = [] + for grid_coord, count in zip(idx, counts): + end = start + count + q_chunk = q[:, :, start:end, :] + k0, v0 = self.select_topkv(q_chunk, k, v, topk) + out = scaled_dot_product_attention(q_chunk, k0, v0) + outs.append(out) + start += count + out = torch.cat(outs, dim=-2) + self.topk = False + return out + + def select_topkv(self, q_chunk, k, v, topk): + q1 = q_chunk[:, :, ::50, :] + sim = q1 @ k.transpose(-1, -2) + sim = torch.mean(sim, -2) + topk_ind = torch.topk(sim, dim=-1, k=topk).indices.squeeze(-2).unsqueeze(-1) + topk_ind = topk_ind.expand(-1, -1, -1, v.shape[-1]) + v0 = torch.gather(v, dim=-2, index=topk_ind) + k0 = torch.gather(k, dim=-2, index=topk_ind) + return k0, v0 + + +class FlashVDMTopMCrossAttentionProcessor(FlashVDMCrossAttentionProcessor): + def select_topkv(self, q_chunk, k, v, topk): + q1 = q_chunk[:, :, ::30, :] + sim = q1 @ k.transpose(-1, -2) + # sim = sim.to(torch.float32) + sim = sim.softmax(-1) + sim = torch.mean(sim, 1) + activated_token = torch.where(sim > 1e-6)[2] + index = torch.unique(activated_token, return_counts=True)[0].unsqueeze(0).unsqueeze(0).unsqueeze(-1) + index = index.expand(-1, v.shape[1], -1, v.shape[-1]) + v0 = torch.gather(v, dim=-2, index=index) + k0 = torch.gather(k, dim=-2, index=index) + return k0, v0 diff --git a/hy3dgen/shapegen/models/autoencoders/model.py b/hy3dgen/shapegen/models/autoencoders/model.py new file mode 100644 index 0000000..76f78da --- /dev/null +++ b/hy3dgen/shapegen/models/autoencoders/model.py @@ -0,0 +1,189 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. 
+ +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import os + +import torch +import torch.nn as nn +import yaml + +from .attention_blocks import FourierEmbedder, Transformer, CrossAttentionDecoder +from .surface_extractors import MCSurfaceExtractor, SurfaceExtractors +from .volume_decoders import VanillaVolumeDecoder, FlashVDMVolumeDecoding, HierarchicalVolumeDecoding +from ...utils import logger, synchronize_timer, smart_load_model + + +class VectsetVAE(nn.Module): + + @classmethod + @synchronize_timer('VectsetVAE Model Loading') + def from_single_file( + cls, + ckpt_path, + config_path, + device='cuda', + dtype=torch.float16, + use_safetensors=None, + **kwargs, + ): + # load config + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + + # load ckpt + if use_safetensors: + ckpt_path = ckpt_path.replace('.ckpt', '.safetensors') + if not os.path.exists(ckpt_path): + raise FileNotFoundError(f"Model file {ckpt_path} not found") + + logger.info(f"Loading model from {ckpt_path}") + if use_safetensors: + import safetensors.torch + ckpt = safetensors.torch.load_file(ckpt_path, device='cpu') + else: + ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=True) + + model_kwargs = config['params'] + model_kwargs.update(kwargs) + + model = cls(**model_kwargs) + model.load_state_dict(ckpt) + model.to(device=device, dtype=dtype) + return model + + @classmethod + def from_pretrained( + cls, + model_path, + device='cuda', + dtype=torch.float16, + use_safetensors=True, + variant='fp16', + subfolder='hunyuan3d-vae-v2-0', + **kwargs, + ): + config_path, ckpt_path = smart_load_model( + model_path, + subfolder=subfolder, + use_safetensors=use_safetensors, + variant=variant + ) + + return cls.from_single_file( + ckpt_path, + config_path, + device=device, + dtype=dtype, + use_safetensors=use_safetensors, + **kwargs + ) + + def __init__( + self, + volume_decoder=None, + surface_extractor=None + ): + super().__init__() + if volume_decoder is None: + volume_decoder = VanillaVolumeDecoder() + if surface_extractor is None: + surface_extractor = MCSurfaceExtractor() + self.volume_decoder = volume_decoder + self.surface_extractor = surface_extractor + + def latents2mesh(self, latents: torch.FloatTensor, **kwargs): + with synchronize_timer('Volume decoding'): + grid_logits = self.volume_decoder(latents, self.geo_decoder, **kwargs) + with synchronize_timer('Surface extraction'): + outputs = self.surface_extractor(grid_logits, **kwargs) + return outputs + + def enable_flashvdm_decoder( + self, + enabled: bool = True, + adaptive_kv_selection=True, + topk_mode='mean', + mc_algo='dmc', + ): + if enabled: + if adaptive_kv_selection: + self.volume_decoder = FlashVDMVolumeDecoding(topk_mode) + else: + self.volume_decoder = HierarchicalVolumeDecoding() + if mc_algo not in SurfaceExtractors.keys(): + raise ValueError(f'Unsupported mc_algo {mc_algo}, available: {list(SurfaceExtractors.keys())}') + self.surface_extractor = SurfaceExtractors[mc_algo]() + else: + self.volume_decoder = VanillaVolumeDecoder() + self.surface_extractor = MCSurfaceExtractor() + + +class ShapeVAE(VectsetVAE): + def __init__( + self, + *, + num_latents: 
int, + embed_dim: int, + width: int, + heads: int, + num_decoder_layers: int, + geo_decoder_downsample_ratio: int = 1, + geo_decoder_mlp_expand_ratio: int = 4, + geo_decoder_ln_post: bool = True, + num_freqs: int = 8, + include_pi: bool = True, + qkv_bias: bool = True, + qk_norm: bool = False, + label_type: str = "binary", + drop_path_rate: float = 0.0, + scale_factor: float = 1.0, + ): + super().__init__() + self.geo_decoder_ln_post = geo_decoder_ln_post + + self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi) + + self.post_kl = nn.Linear(embed_dim, width) + + self.transformer = Transformer( + n_ctx=num_latents, + width=width, + layers=num_decoder_layers, + heads=heads, + qkv_bias=qkv_bias, + qk_norm=qk_norm, + drop_path_rate=drop_path_rate + ) + + self.geo_decoder = CrossAttentionDecoder( + fourier_embedder=self.fourier_embedder, + out_channels=1, + num_latents=num_latents, + mlp_expand_ratio=geo_decoder_mlp_expand_ratio, + downsample_ratio=geo_decoder_downsample_ratio, + enable_ln_post=self.geo_decoder_ln_post, + width=width // geo_decoder_downsample_ratio, + heads=heads // geo_decoder_downsample_ratio, + qkv_bias=qkv_bias, + qk_norm=qk_norm, + label_type=label_type, + ) + + self.scale_factor = scale_factor + self.latent_shape = (num_latents, embed_dim) + + def forward(self, latents): + latents = self.post_kl(latents) + latents = self.transformer(latents) + return latents diff --git a/hy3dgen/shapegen/models/autoencoders/surface_extractors.py b/hy3dgen/shapegen/models/autoencoders/surface_extractors.py new file mode 100644 index 0000000..f4d8f63 --- /dev/null +++ b/hy3dgen/shapegen/models/autoencoders/surface_extractors.py @@ -0,0 +1,100 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
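ShapeVAE therefore splits decoding into two stages: forward maps the KL latents through post_kl and the transformer, and latents2mesh hands the result to the volume decoder and surface extractor. A minimal usage sketch; the repo id matches the VAE mapping used by the pipeline later in this commit, the random latents stand in for diffusion output, and the numeric settings are illustrative:

import torch
from hy3dgen.shapegen.models.autoencoders import ShapeVAE

vae = ShapeVAE.from_pretrained('tencent/Hunyuan3D-2', subfolder='hunyuan3d-vae-v2-0')
latents = torch.randn(1, *vae.latent_shape, dtype=torch.float16, device='cuda')

latents = vae(latents)                  # post_kl projection + transformer
meshes = vae.latents2mesh(
    latents,
    bounds=1.01,                        # kwargs are forwarded to the volume decoder
    mc_level=0.0,                       # and to the surface extractor
    octree_resolution=256,
    num_chunks=8000,
)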
+
+from typing import Union, Tuple, List
+
+import numpy as np
+import torch
+from skimage import measure
+
+
+class Latent2MeshOutput:
+
+    def __init__(self, mesh_v=None, mesh_f=None):
+        self.mesh_v = mesh_v
+        self.mesh_f = mesh_f
+
+
+def center_vertices(vertices):
+    """Translate the vertices so that bounding box is centered at zero."""
+    vert_min = vertices.min(dim=0)[0]
+    vert_max = vertices.max(dim=0)[0]
+    vert_center = 0.5 * (vert_min + vert_max)
+    return vertices - vert_center
+
+
+class SurfaceExtractor:
+    def _compute_box_stat(self, bounds: Union[Tuple[float], List[float], float], octree_resolution: int):
+        if isinstance(bounds, float):
+            bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds]
+
+        bbox_min, bbox_max = np.array(bounds[0:3]), np.array(bounds[3:6])
+        bbox_size = bbox_max - bbox_min
+        grid_size = [int(octree_resolution) + 1, int(octree_resolution) + 1, int(octree_resolution) + 1]
+        return grid_size, bbox_min, bbox_size
+
+    def run(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def __call__(self, grid_logits, **kwargs):
+        outputs = []
+        for i in range(grid_logits.shape[0]):
+            try:
+                vertices, faces = self.run(grid_logits[i], **kwargs)
+                vertices = vertices.astype(np.float32)
+                faces = np.ascontiguousarray(faces)
+                outputs.append(Latent2MeshOutput(mesh_v=vertices, mesh_f=faces))
+
+            except Exception:
+                import traceback
+                traceback.print_exc()
+                outputs.append(None)
+
+        return outputs
+
+
+class MCSurfaceExtractor(SurfaceExtractor):
+    def run(self, grid_logit, *, mc_level, bounds, octree_resolution, **kwargs):
+        vertices, faces, normals, _ = measure.marching_cubes(
+            grid_logit.cpu().numpy(),
+            mc_level,
+            method="lewiner"
+        )
+        grid_size, bbox_min, bbox_size = self._compute_box_stat(bounds, octree_resolution)
+        vertices = vertices / grid_size * bbox_size + bbox_min
+        return vertices, faces
+
+
+class DMCSurfaceExtractor(SurfaceExtractor):
+    def run(self, grid_logit, *, octree_resolution, **kwargs):
+        device = grid_logit.device
+        if not hasattr(self, 'dmc'):
+            try:
+                from diso import DiffDMC
+            except ImportError:
+                raise ImportError("Please install diso via `pip install diso`, or set mc_algo to 'mc'")
+            self.dmc = DiffDMC(dtype=torch.float32).to(device)
+        sdf = -grid_logit / octree_resolution
+        sdf = sdf.to(torch.float32).contiguous()
+        verts, faces = self.dmc(sdf, deform=None, return_quads=False, normalize=True)
+        verts = center_vertices(verts)
+        vertices = verts.detach().cpu().numpy()
+        faces = faces.detach().cpu().numpy()[:, ::-1]
+        return vertices, faces
+
+
+SurfaceExtractors = {
+    'mc': MCSurfaceExtractor,
+    'dmc': DMCSurfaceExtractor,
+}
diff --git a/hy3dgen/shapegen/models/autoencoders/volume_decoders.py b/hy3dgen/shapegen/models/autoencoders/volume_decoders.py
new file mode 100644
index 0000000..d7bfd84
--- /dev/null
+++ b/hy3dgen/shapegen/models/autoencoders/volume_decoders.py
@@ -0,0 +1,435 @@
+# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
+# except for the third-party components listed below.
+# Hunyuan 3D does not impose any additional limitations beyond what is outlined
+# in the respective licenses of these third-party components.
+# Users must comply with all terms and conditions of original licenses of these third-party
+# components and must ensure that the usage of the third party components adheres to
+# all relevant laws and regulations.
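The extractors are looked up through the SurfaceExtractors registry and can also be run directly on a decoded logit grid; items that fail come back as None rather than raising. A small sketch with a random grid standing in for real volume-decoder output (the resolution and output filename are illustrative):

import torch
import trimesh
from hy3dgen.shapegen.models.autoencoders import SurfaceExtractors

grid_logits = torch.randn(1, 65, 65, 65)       # (batch, res + 1, res + 1, res + 1)
extractor = SurfaceExtractors['mc']()          # 'dmc' additionally requires the diso package
outputs = extractor(grid_logits, mc_level=0.0, bounds=1.01, octree_resolution=64)

result = outputs[0]                            # Latent2MeshOutput, or None on failure
if result is not None:
    trimesh.Trimesh(result.mesh_v, result.mesh_f).export('shape.glb')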
+
+# For avoidance of doubts, Hunyuan 3D means the large language models and
+# their software and algorithms, including trained model weights, parameters (including
+# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
+# fine-tuning enabling code and other elements of the foregoing made publicly available
+# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
+
+from typing import Union, Tuple, List, Callable
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import repeat
+from tqdm import tqdm
+
+from .attention_blocks import CrossAttentionDecoder
+from .attention_processors import FlashVDMCrossAttentionProcessor, FlashVDMTopMCrossAttentionProcessor
+from ...utils import logger
+
+
+def extract_near_surface_volume_fn(input_tensor: torch.Tensor, alpha: float):
+    device = input_tensor.device
+    D = input_tensor.shape[0]
+    signed_val = 0.0
+
+    # Apply the offset and mark invalid values
+    val = input_tensor + alpha
+    valid_mask = val > -9000  # -9000 is assumed to mark invalid voxels
+
+    # Neighbor lookup that keeps the output shape identical to the input
+    def get_neighbor(t, shift, axis):
+        """Shift the volume along the given axis while keeping its shape."""
+        if shift == 0:
+            return t.clone()
+
+        # Choose the padding axis (the [D, D, D] input corresponds to the z, y, x axes)
+        pad_dims = [0, 0, 0, 0, 0, 0]  # format: [x_front, x_back, y_front, y_back, z_front, z_back]
+
+        # Set the padding amount for the requested axis
+        if axis == 0:  # x axis (last dimension)
+            pad_idx = 0 if shift > 0 else 1
+            pad_dims[pad_idx] = abs(shift)
+        elif axis == 1:  # y axis (middle dimension)
+            pad_idx = 2 if shift > 0 else 3
+            pad_dims[pad_idx] = abs(shift)
+        elif axis == 2:  # z axis (first dimension)
+            pad_idx = 4 if shift > 0 else 5
+            pad_dims[pad_idx] = abs(shift)
+
+        # Pad (add batch and channel dims so the tensor fits F.pad)
+        padded = F.pad(t.unsqueeze(0).unsqueeze(0), pad_dims[::-1], mode='replicate')  # reversed to match F.pad's order
+
+        # Build the slicing indices for the shifted view
+        slice_dims = [slice(None)] * 3  # start with full slices
+        if axis == 0:  # x axis (dim=2)
+            if shift > 0:
+                slice_dims[0] = slice(shift, None)
+            else:
+                slice_dims[0] = slice(None, shift)
+        elif axis == 1:  # y axis (dim=1)
+            if shift > 0:
+                slice_dims[1] = slice(shift, None)
+            else:
+                slice_dims[1] = slice(None, shift)
+        elif axis == 2:  # z axis (dim=0)
+            if shift > 0:
+                slice_dims[2] = slice(shift, None)
+            else:
+                slice_dims[2] = slice(None, shift)
+
+        # Apply the slice and restore the original dimensions
+        padded = padded.squeeze(0).squeeze(0)
+        sliced = padded[slice_dims]
+        return sliced
+
+    # Gather the neighbors in every direction (shapes stay consistent)
+    left = get_neighbor(val, 1, axis=0)  # x direction
+    right = get_neighbor(val, -1, axis=0)
+    back = get_neighbor(val, 1, axis=1)  # y direction
+    front = get_neighbor(val, -1, axis=1)
+    down = get_neighbor(val, 1, axis=2)  # z direction
+    up = get_neighbor(val, -1, axis=2)
+
+    # Handle invalid values at the boundary (where keeps the shapes consistent)
+    def safe_where(neighbor):
+        return torch.where(neighbor > -9000, neighbor, val)
+
+    left = safe_where(left)
+    right = safe_where(right)
+    back = safe_where(back)
+    front = safe_where(front)
+    down = safe_where(down)
+    up = safe_where(up)
+
+    # Compare signs (cast to float32 to keep the comparison exact)
+    sign = torch.sign(val.to(torch.float32))
+    neighbors_sign = torch.stack([
+        torch.sign(left.to(torch.float32)),
+        torch.sign(right.to(torch.float32)),
+        torch.sign(back.to(torch.float32)),
+        torch.sign(front.to(torch.float32)),
+        torch.sign(down.to(torch.float32)),
+        torch.sign(up.to(torch.float32))
+    ], dim=0)
+
+    # Check whether every neighbor sign agrees with the center voxel
+    same_sign = torch.all(neighbors_sign == sign, dim=0)
+
+    # Final mask: voxels with a sign change, restricted to valid voxels
+    mask = (~same_sign).to(torch.int32)
+    return mask * valid_mask.to(torch.int32)
+
+
+def generate_dense_grid_points(
+    bbox_min: np.ndarray,
+    bbox_max: np.ndarray,
+    octree_resolution: int,
+    indexing: str = "ij",
+):
+    length = bbox_max - bbox_min
+    num_cells = octree_resolution
+
+    x = np.linspace(bbox_min[0], bbox_max[0],
int(num_cells) + 1, dtype=np.float32) + y = np.linspace(bbox_min[1], bbox_max[1], int(num_cells) + 1, dtype=np.float32) + z = np.linspace(bbox_min[2], bbox_max[2], int(num_cells) + 1, dtype=np.float32) + [xs, ys, zs] = np.meshgrid(x, y, z, indexing=indexing) + xyz = np.stack((xs, ys, zs), axis=-1) + grid_size = [int(num_cells) + 1, int(num_cells) + 1, int(num_cells) + 1] + + return xyz, grid_size, length + + +class VanillaVolumeDecoder: + @torch.no_grad() + def __call__( + self, + latents: torch.FloatTensor, + geo_decoder: Callable, + bounds: Union[Tuple[float], List[float], float] = 1.01, + num_chunks: int = 10000, + octree_resolution: int = None, + enable_pbar: bool = True, + **kwargs, + ): + device = latents.device + dtype = latents.dtype + batch_size = latents.shape[0] + + # 1. generate query points + if isinstance(bounds, float): + bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds] + + bbox_min, bbox_max = np.array(bounds[0:3]), np.array(bounds[3:6]) + xyz_samples, grid_size, length = generate_dense_grid_points( + bbox_min=bbox_min, + bbox_max=bbox_max, + octree_resolution=octree_resolution, + indexing="ij" + ) + xyz_samples = torch.from_numpy(xyz_samples).to(device, dtype=dtype).contiguous().reshape(-1, 3) + + # 2. latents to 3d volume + batch_logits = [] + for start in tqdm(range(0, xyz_samples.shape[0], num_chunks), desc=f"Volume Decoding", + disable=not enable_pbar): + chunk_queries = xyz_samples[start: start + num_chunks, :] + chunk_queries = repeat(chunk_queries, "p c -> b p c", b=batch_size) + logits = geo_decoder(queries=chunk_queries, latents=latents) + batch_logits.append(logits) + + grid_logits = torch.cat(batch_logits, dim=1) + grid_logits = grid_logits.view((batch_size, *grid_size)).float() + + return grid_logits + + +class HierarchicalVolumeDecoding: + @torch.no_grad() + def __call__( + self, + latents: torch.FloatTensor, + geo_decoder: Callable, + bounds: Union[Tuple[float], List[float], float] = 1.01, + num_chunks: int = 10000, + mc_level: float = 0.0, + octree_resolution: int = None, + min_resolution: int = 63, + enable_pbar: bool = True, + **kwargs, + ): + device = latents.device + dtype = latents.dtype + + resolutions = [] + if octree_resolution < min_resolution: + resolutions.append(octree_resolution) + while octree_resolution >= min_resolution: + resolutions.append(octree_resolution) + octree_resolution = octree_resolution // 2 + resolutions.reverse() + + # 1. generate query points + if isinstance(bounds, float): + bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds] + bbox_min = np.array(bounds[0:3]) + bbox_max = np.array(bounds[3:6]) + bbox_size = bbox_max - bbox_min + + xyz_samples, grid_size, length = generate_dense_grid_points( + bbox_min=bbox_min, + bbox_max=bbox_max, + octree_resolution=resolutions[0], + indexing="ij" + ) + + dilate = nn.Conv3d(1, 1, 3, padding=1, bias=False, device=device, dtype=dtype) + dilate.weight = torch.nn.Parameter(torch.ones(dilate.weight.shape, dtype=dtype, device=device)) + + grid_size = np.array(grid_size) + xyz_samples = torch.from_numpy(xyz_samples).to(device, dtype=dtype).contiguous().reshape(-1, 3) + + # 2. 
latents to 3d volume + batch_logits = [] + batch_size = latents.shape[0] + for start in tqdm(range(0, xyz_samples.shape[0], num_chunks), + desc=f"Hierarchical Volume Decoding [r{resolutions[0] + 1}]"): + queries = xyz_samples[start: start + num_chunks, :] + batch_queries = repeat(queries, "p c -> b p c", b=batch_size) + logits = geo_decoder(queries=batch_queries, latents=latents) + batch_logits.append(logits) + + grid_logits = torch.cat(batch_logits, dim=1).view((batch_size, grid_size[0], grid_size[1], grid_size[2])) + + for octree_depth_now in resolutions[1:]: + grid_size = np.array([octree_depth_now + 1] * 3) + resolution = bbox_size / octree_depth_now + next_index = torch.zeros(tuple(grid_size), dtype=dtype, device=device) + next_logits = torch.full(next_index.shape, -10000., dtype=dtype, device=device) + curr_points = extract_near_surface_volume_fn(grid_logits.squeeze(0), mc_level) + curr_points += grid_logits.squeeze(0).abs() < 0.95 + + if octree_depth_now == resolutions[-1]: + expand_num = 0 + else: + expand_num = 1 + for i in range(expand_num): + curr_points = dilate(curr_points.unsqueeze(0).to(dtype)).squeeze(0) + (cidx_x, cidx_y, cidx_z) = torch.where(curr_points > 0) + next_index[cidx_x * 2, cidx_y * 2, cidx_z * 2] = 1 + for i in range(2 - expand_num): + next_index = dilate(next_index.unsqueeze(0)).squeeze(0) + nidx = torch.where(next_index > 0) + + next_points = torch.stack(nidx, dim=1) + next_points = (next_points * torch.tensor(resolution, dtype=next_points.dtype, device=device) + + torch.tensor(bbox_min, dtype=next_points.dtype, device=device)) + batch_logits = [] + for start in tqdm(range(0, next_points.shape[0], num_chunks), + desc=f"Hierarchical Volume Decoding [r{octree_depth_now + 1}]"): + queries = next_points[start: start + num_chunks, :] + batch_queries = repeat(queries, "p c -> b p c", b=batch_size) + logits = geo_decoder(queries=batch_queries.to(latents.dtype), latents=latents) + batch_logits.append(logits) + grid_logits = torch.cat(batch_logits, dim=1) + next_logits[nidx] = grid_logits[0, ..., 0] + grid_logits = next_logits.unsqueeze(0) + grid_logits[grid_logits == -10000.] = float('nan') + + return grid_logits + + +class FlashVDMVolumeDecoding: + def __init__(self, topk_mode='mean'): + if topk_mode not in ['mean', 'merge']: + raise ValueError(f'Unsupported topk_mode {topk_mode}, available: {["mean", "merge"]}') + + if topk_mode == 'mean': + self.processor = FlashVDMCrossAttentionProcessor() + else: + self.processor = FlashVDMTopMCrossAttentionProcessor() + + @torch.no_grad() + def __call__( + self, + latents: torch.FloatTensor, + geo_decoder: CrossAttentionDecoder, + bounds: Union[Tuple[float], List[float], float] = 1.01, + num_chunks: int = 10000, + mc_level: float = 0.0, + octree_resolution: int = None, + min_resolution: int = 63, + mini_grid_num: int = 4, + enable_pbar: bool = True, + **kwargs, + ): + processor = self.processor + geo_decoder.set_cross_attention_processor(processor) + + device = latents.device + dtype = latents.dtype + + resolutions = [] + if octree_resolution < min_resolution: + resolutions.append(octree_resolution) + while octree_resolution >= min_resolution: + resolutions.append(octree_resolution) + octree_resolution = octree_resolution // 2 + resolutions.reverse() + resolutions[0] = round(resolutions[0] / mini_grid_num) * mini_grid_num - 1 + for i, resolution in enumerate(resolutions[1:]): + resolutions[i + 1] = resolutions[0] * 2 ** (i + 1) + + logger.info(f"FlashVDMVolumeDecoding Resolution: {resolutions}") + + # 1. 
generate query points + if isinstance(bounds, float): + bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds] + bbox_min = np.array(bounds[0:3]) + bbox_max = np.array(bounds[3:6]) + bbox_size = bbox_max - bbox_min + + xyz_samples, grid_size, length = generate_dense_grid_points( + bbox_min=bbox_min, + bbox_max=bbox_max, + octree_resolution=resolutions[0], + indexing="ij" + ) + + dilate = nn.Conv3d(1, 1, 3, padding=1, bias=False, device=device, dtype=dtype) + dilate.weight = torch.nn.Parameter(torch.ones(dilate.weight.shape, dtype=dtype, device=device)) + + grid_size = np.array(grid_size) + + # 2. latents to 3d volume + xyz_samples = torch.from_numpy(xyz_samples).to(device, dtype=dtype) + batch_size = latents.shape[0] + mini_grid_size = xyz_samples.shape[0] // mini_grid_num + xyz_samples = xyz_samples.view( + mini_grid_num, mini_grid_size, + mini_grid_num, mini_grid_size, + mini_grid_num, mini_grid_size, 3 + ).permute( + 0, 2, 4, 1, 3, 5, 6 + ).reshape( + -1, mini_grid_size * mini_grid_size * mini_grid_size, 3 + ) + batch_logits = [] + num_batchs = max(num_chunks // xyz_samples.shape[1], 1) + for start in tqdm(range(0, xyz_samples.shape[0], num_batchs), + desc=f"FlashVDM Volume Decoding", disable=not enable_pbar): + queries = xyz_samples[start: start + num_batchs, :] + batch = queries.shape[0] + batch_latents = repeat(latents.squeeze(0), "p c -> b p c", b=batch) + processor.topk = True + logits = geo_decoder(queries=queries, latents=batch_latents) + batch_logits.append(logits) + grid_logits = torch.cat(batch_logits, dim=0).reshape( + mini_grid_num, mini_grid_num, mini_grid_num, + mini_grid_size, mini_grid_size, + mini_grid_size + ).permute(0, 3, 1, 4, 2, 5).contiguous().view( + (batch_size, grid_size[0], grid_size[1], grid_size[2]) + ) + + for octree_depth_now in resolutions[1:]: + grid_size = np.array([octree_depth_now + 1] * 3) + resolution = bbox_size / octree_depth_now + next_index = torch.zeros(tuple(grid_size), dtype=dtype, device=device) + next_logits = torch.full(next_index.shape, -10000., dtype=dtype, device=device) + curr_points = extract_near_surface_volume_fn(grid_logits.squeeze(0), mc_level) + curr_points += grid_logits.squeeze(0).abs() < 0.95 + + if octree_depth_now == resolutions[-1]: + expand_num = 0 + else: + expand_num = 1 + for i in range(expand_num): + curr_points = dilate(curr_points.unsqueeze(0).to(dtype)).squeeze(0) + (cidx_x, cidx_y, cidx_z) = torch.where(curr_points > 0) + + next_index[cidx_x * 2, cidx_y * 2, cidx_z * 2] = 1 + for i in range(2 - expand_num): + next_index = dilate(next_index.unsqueeze(0)).squeeze(0) + nidx = torch.where(next_index > 0) + + next_points = torch.stack(nidx, dim=1) + next_points = (next_points * torch.tensor(resolution, dtype=torch.float32, device=device) + + torch.tensor(bbox_min, dtype=torch.float32, device=device)) + + query_grid_num = 6 + min_val = next_points.min(axis=0).values + max_val = next_points.max(axis=0).values + vol_queries_index = (next_points - min_val) / (max_val - min_val) * (query_grid_num - 0.001) + index = torch.floor(vol_queries_index).long() + index = index[..., 0] * (query_grid_num ** 2) + index[..., 1] * query_grid_num + index[..., 2] + index = index.sort() + next_points = next_points[index.indices].unsqueeze(0).contiguous() + unique_values = torch.unique(index.values, return_counts=True) + grid_logits = torch.zeros((next_points.shape[1]), dtype=latents.dtype, device=latents.device) + input_grid = [[], []] + logits_grid_list = [] + start_num = 0 + sum_num = 0 + for grid_index, count in 
zip(unique_values[0].cpu().tolist(), unique_values[1].cpu().tolist()): + if sum_num + count < num_chunks or sum_num == 0: + sum_num += count + input_grid[0].append(grid_index) + input_grid[1].append(count) + else: + processor.topk = input_grid + logits_grid = geo_decoder(queries=next_points[:, start_num:start_num + sum_num], latents=latents) + start_num = start_num + sum_num + logits_grid_list.append(logits_grid) + input_grid = [[grid_index], [count]] + sum_num = count + if sum_num > 0: + processor.topk = input_grid + logits_grid = geo_decoder(queries=next_points[:, start_num:start_num + sum_num], latents=latents) + logits_grid_list.append(logits_grid) + logits_grid = torch.cat(logits_grid_list, dim=1) + grid_logits[index.indices] = logits_grid.squeeze(0).squeeze(-1) + next_logits[nidx] = grid_logits + grid_logits = next_logits.unsqueeze(0) + + grid_logits[grid_logits == -10000.] = float('nan') + + return grid_logits diff --git a/hy3dgen/shapegen/models/conditioner.py b/hy3dgen/shapegen/models/conditioner.py new file mode 100644 index 0000000..d0d848c --- /dev/null +++ b/hy3dgen/shapegen/models/conditioner.py @@ -0,0 +1,257 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import numpy as np +import torch +import torch.nn as nn +from torchvision import transforms +from transformers import ( + CLIPVisionModelWithProjection, + CLIPVisionConfig, + Dinov2Model, + Dinov2Config, +) + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float64) + omega /= embed_dim / 2. + omega = 1. 
/ 10000 ** omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + return np.concatenate([emb_sin, emb_cos], axis=1) + + +class ImageEncoder(nn.Module): + def __init__( + self, + version=None, + config=None, + use_cls_token=True, + image_size=224, + **kwargs, + ): + super().__init__() + + if config is None: + self.model = self.MODEL_CLASS.from_pretrained(version) + else: + self.model = self.MODEL_CLASS(self.MODEL_CONFIG_CLASS.from_dict(config)) + self.model.eval() + self.model.requires_grad_(False) + self.use_cls_token = use_cls_token + self.size = image_size // 14 + self.num_patches = (image_size // 14) ** 2 + if self.use_cls_token: + self.num_patches += 1 + + self.transform = transforms.Compose( + [ + transforms.Resize(image_size, transforms.InterpolationMode.BILINEAR, antialias=True), + transforms.CenterCrop(image_size), + transforms.Normalize( + mean=self.mean, + std=self.std, + ), + ] + ) + + def forward(self, image, mask=None, value_range=(-1, 1), **kwargs): + if value_range is not None: + low, high = value_range + image = (image - low) / (high - low) + + image = image.to(self.model.device, dtype=self.model.dtype) + inputs = self.transform(image) + outputs = self.model(inputs) + + last_hidden_state = outputs.last_hidden_state + if not self.use_cls_token: + last_hidden_state = last_hidden_state[:, 1:, :] + + return last_hidden_state + + def unconditional_embedding(self, batch_size, **kwargs): + device = next(self.model.parameters()).device + dtype = next(self.model.parameters()).dtype + zero = torch.zeros( + batch_size, + self.num_patches, + self.model.config.hidden_size, + device=device, + dtype=dtype, + ) + + return zero + + +class CLIPImageEncoder(ImageEncoder): + MODEL_CLASS = CLIPVisionModelWithProjection + MODEL_CONFIG_CLASS = CLIPVisionConfig + mean = [0.48145466, 0.4578275, 0.40821073] + std = [0.26862954, 0.26130258, 0.27577711] + + +class DinoImageEncoder(ImageEncoder): + MODEL_CLASS = Dinov2Model + MODEL_CONFIG_CLASS = Dinov2Config + mean = [0.485, 0.456, 0.406] + std = [0.229, 0.224, 0.225] + + +class DinoImageEncoderMV(DinoImageEncoder): + def __init__( + self, + version=None, + config=None, + use_cls_token=True, + image_size=224, + view_num=4, + **kwargs, + ): + super().__init__(version, config, use_cls_token, image_size, **kwargs) + self.view_num = view_num + self.num_patches = self.num_patches + pos = np.arange(self.view_num, dtype=np.float32) + view_embedding = torch.from_numpy( + get_1d_sincos_pos_embed_from_grid(self.model.config.hidden_size, pos)).float() + + view_embedding = view_embedding.unsqueeze(1).repeat(1, self.num_patches, 1) + self.view_embed = view_embedding.unsqueeze(0) + + def forward(self, image, mask=None, value_range=(-1, 1), view_idxs=None): + if value_range is not None: + low, high = value_range + image = (image - low) / (high - low) + + image = image.to(self.model.device, dtype=self.model.dtype) + + bs, num_views, c, h, w = image.shape + image = image.view(bs * num_views, c, h, w) + + inputs = self.transform(image) + outputs = self.model(inputs) + + last_hidden_state = outputs.last_hidden_state + last_hidden_state = last_hidden_state.view( + bs, num_views, last_hidden_state.shape[-2], + last_hidden_state.shape[-1] + ) + + view_embedding = self.view_embed.to(last_hidden_state.dtype).to(last_hidden_state.device) + if view_idxs is not None: + assert len(view_idxs) == bs + view_embeddings = [] + for i in range(bs): + 
view_idx = view_idxs[i] + assert num_views == len(view_idx) + view_embeddings.append(self.view_embed[:, view_idx, ...]) + view_embedding = torch.cat(view_embeddings, 0).to(last_hidden_state.dtype).to(last_hidden_state.device) + + if num_views != self.view_num: + view_embedding = view_embedding[:, :num_views, ...] + last_hidden_state = last_hidden_state + view_embedding + last_hidden_state = last_hidden_state.view(bs, num_views * last_hidden_state.shape[-2], + last_hidden_state.shape[-1]) + return last_hidden_state + + def unconditional_embedding(self, batch_size, view_idxs=None, **kwargs): + device = next(self.model.parameters()).device + dtype = next(self.model.parameters()).dtype + zero = torch.zeros( + batch_size, + self.num_patches * len(view_idxs[0]), + self.model.config.hidden_size, + device=device, + dtype=dtype, + ) + return zero + + +def build_image_encoder(config): + if config['type'] == 'CLIPImageEncoder': + return CLIPImageEncoder(**config['kwargs']) + elif config['type'] == 'DinoImageEncoder': + return DinoImageEncoder(**config['kwargs']) + elif config['type'] == 'DinoImageEncoderMV': + return DinoImageEncoderMV(**config['kwargs']) + else: + raise ValueError(f'Unknown image encoder type: {config["type"]}') + + +class DualImageEncoder(nn.Module): + def __init__( + self, + main_image_encoder, + additional_image_encoder, + ): + super().__init__() + self.main_image_encoder = build_image_encoder(main_image_encoder) + self.additional_image_encoder = build_image_encoder(additional_image_encoder) + + def forward(self, image, mask=None, **kwargs): + outputs = { + 'main': self.main_image_encoder(image, mask=mask, **kwargs), + 'additional': self.additional_image_encoder(image, mask=mask, **kwargs), + } + return outputs + + def unconditional_embedding(self, batch_size, **kwargs): + outputs = { + 'main': self.main_image_encoder.unconditional_embedding(batch_size, **kwargs), + 'additional': self.additional_image_encoder.unconditional_embedding(batch_size, **kwargs), + } + return outputs + + +class SingleImageEncoder(nn.Module): + def __init__( + self, + main_image_encoder, + ): + super().__init__() + self.main_image_encoder = build_image_encoder(main_image_encoder) + + def forward(self, image, mask=None, **kwargs): + outputs = { + 'main': self.main_image_encoder(image, mask=mask, **kwargs), + } + return outputs + + def unconditional_embedding(self, batch_size, **kwargs): + outputs = { + 'main': self.main_image_encoder.unconditional_embedding(batch_size, **kwargs), + } + return outputs diff --git a/hy3dgen/shapegen/models/denoisers/__init__.py b/hy3dgen/shapegen/models/denoisers/__init__.py new file mode 100644 index 0000000..7260933 --- /dev/null +++ b/hy3dgen/shapegen/models/denoisers/__init__.py @@ -0,0 +1,15 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. 
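build_image_encoder turns a small config dict into one of these encoders, and the SingleImageEncoder/DualImageEncoder wrappers return conditioning tokens keyed by 'main' (and 'additional'). A sketch of wiring one up by hand; the DINOv2 checkpoint name and image size are assumptions for illustration, not values read from a shipped config:

import torch
from hy3dgen.shapegen.models.conditioner import SingleImageEncoder

conditioner = SingleImageEncoder(
    main_image_encoder={
        'type': 'DinoImageEncoder',
        'kwargs': {'version': 'facebook/dinov2-giant', 'image_size': 518},
    },
)
image = torch.rand(2, 3, 518, 518) * 2 - 1     # value_range defaults to (-1, 1)
cond = conditioner(image)                      # {'main': (2, num_patches, hidden_size)}
uncond = conditioner.unconditional_embedding(batch_size=2)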
+ +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +from .hunyuan3ddit import Hunyuan3DDiT diff --git a/hy3dgen/shapegen/models/denoisers/hunyuan3ddit.py b/hy3dgen/shapegen/models/denoisers/hunyuan3ddit.py new file mode 100644 index 0000000..7873f16 --- /dev/null +++ b/hy3dgen/shapegen/models/denoisers/hunyuan3ddit.py @@ -0,0 +1,410 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import math +import os +from dataclasses import dataclass +from typing import List, Tuple, Optional + +import torch +from einops import rearrange +from torch import Tensor, nn + +scaled_dot_product_attention = nn.functional.scaled_dot_product_attention +if os.environ.get('USE_SAGEATTN', '0') == '1': + try: + from sageattention import sageattn + except ImportError: + raise ImportError('Please install the package "sageattention" to use this USE_SAGEATTN.') + scaled_dot_product_attention = sageattn + + +def attention(q: Tensor, k: Tensor, v: Tensor, **kwargs) -> Tensor: + x = scaled_dot_product_attention(q, k, v) + x = rearrange(x, "B H L D -> B L (H D)") + return x + + +def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0): + """ + Create sinusoidal timestep embeddings. + :param t: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param dim: the dimension of the output. + :param max_period: controls the minimum frequency of the embeddings. + :return: an (N, D) Tensor of positional embeddings. 
+ """ + t = time_factor * t + half = dim // 2 + freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to( + t.device + ) + + args = t[:, None].float() * freqs[None] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) + if torch.is_floating_point(t): + embedding = embedding.to(t) + return embedding + + +class GELU(nn.Module): + def __init__(self, approximate='tanh'): + super().__init__() + self.approximate = approximate + + def forward(self, x: Tensor) -> Tensor: + return nn.functional.gelu(x.contiguous(), approximate=self.approximate) + + +class MLPEmbedder(nn.Module): + def __init__(self, in_dim: int, hidden_dim: int): + super().__init__() + self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True) + self.silu = nn.SiLU() + self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True) + + def forward(self, x: Tensor) -> Tensor: + return self.out_layer(self.silu(self.in_layer(x))) + + +class RMSNorm(torch.nn.Module): + def __init__(self, dim: int): + super().__init__() + self.scale = nn.Parameter(torch.ones(dim)) + + def forward(self, x: Tensor): + x_dtype = x.dtype + x = x.float() + rrms = torch.rsqrt(torch.mean(x ** 2, dim=-1, keepdim=True) + 1e-6) + return (x * rrms).to(dtype=x_dtype) * self.scale + + +class QKNorm(torch.nn.Module): + def __init__(self, dim: int): + super().__init__() + self.query_norm = RMSNorm(dim) + self.key_norm = RMSNorm(dim) + + def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tuple[Tensor, Tensor]: + q = self.query_norm(q) + k = self.key_norm(k) + return q.to(v), k.to(v) + + +class SelfAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + ): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.norm = QKNorm(head_dim) + self.proj = nn.Linear(dim, dim) + + def forward(self, x: Tensor, pe: Tensor) -> Tensor: + qkv = self.qkv(x) + q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + q, k = self.norm(q, k, v) + x = attention(q, k, v, pe=pe) + x = self.proj(x) + return x + + +@dataclass +class ModulationOut: + shift: Tensor + scale: Tensor + gate: Tensor + + +class Modulation(nn.Module): + def __init__(self, dim: int, double: bool): + super().__init__() + self.is_double = double + self.multiplier = 6 if double else 3 + self.lin = nn.Linear(dim, self.multiplier * dim, bias=True) + + def forward(self, vec: Tensor) -> Tuple[ModulationOut, Optional[ModulationOut]]: + out = self.lin(nn.functional.silu(vec))[:, None, :] + out = out.chunk(self.multiplier, dim=-1) + + return ( + ModulationOut(*out[:3]), + ModulationOut(*out[3:]) if self.is_double else None, + ) + + +class DoubleStreamBlock(nn.Module): + def __init__( + self, + hidden_size: int, + num_heads: int, + mlp_ratio: float, + qkv_bias: bool = False, + ): + super().__init__() + mlp_hidden_dim = int(hidden_size * mlp_ratio) + self.num_heads = num_heads + self.hidden_size = hidden_size + self.img_mod = Modulation(hidden_size, double=True) + self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias) + + self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.img_mlp = nn.Sequential( + nn.Linear(hidden_size, mlp_hidden_dim, bias=True), + GELU(approximate="tanh"), 
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True), + ) + + self.txt_mod = Modulation(hidden_size, double=True) + self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias) + + self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.txt_mlp = nn.Sequential( + nn.Linear(hidden_size, mlp_hidden_dim, bias=True), + GELU(approximate="tanh"), + nn.Linear(mlp_hidden_dim, hidden_size, bias=True), + ) + + def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> Tuple[Tensor, Tensor]: + img_mod1, img_mod2 = self.img_mod(vec) + txt_mod1, txt_mod2 = self.txt_mod(vec) + + img_modulated = self.img_norm1(img) + img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift + img_qkv = self.img_attn.qkv(img_modulated) + img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + img_q, img_k = self.img_attn.norm(img_q, img_k, img_v) + + txt_modulated = self.txt_norm1(txt) + txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift + txt_qkv = self.txt_attn.qkv(txt_modulated) + txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v) + + q = torch.cat((txt_q, img_q), dim=2) + k = torch.cat((txt_k, img_k), dim=2) + v = torch.cat((txt_v, img_v), dim=2) + + attn = attention(q, k, v, pe=pe) + txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:] + + img = img + img_mod1.gate * self.img_attn.proj(img_attn) + img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift) + + txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn) + txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift) + return img, txt + + +class SingleStreamBlock(nn.Module): + """ + A DiT block with parallel linear layers as described in + https://arxiv.org/abs/2302.05442 and adapted modulation interface. 
+ """ + + def __init__( + self, + hidden_size: int, + num_heads: int, + mlp_ratio: float = 4.0, + qk_scale: Optional[float] = None, + ): + super().__init__() + + self.hidden_dim = hidden_size + self.num_heads = num_heads + head_dim = hidden_size // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.mlp_hidden_dim = int(hidden_size * mlp_ratio) + # qkv and mlp_in + self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim) + # proj and mlp_out + self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size) + + self.norm = QKNorm(head_dim) + + self.hidden_size = hidden_size + self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + + self.mlp_act = GELU(approximate="tanh") + self.modulation = Modulation(hidden_size, double=False) + + def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor: + mod, _ = self.modulation(vec) + + x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift + qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1) + + q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + q, k = self.norm(q, k, v) + + # compute attention + attn = attention(q, k, v, pe=pe) + # compute activation in mlp stream, cat again and run second linear layer + output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)) + return x + mod.gate * output + + +class LastLayer(nn.Module): + def __init__(self, hidden_size: int, patch_size: int, out_channels: int): + super().__init__() + self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True) + self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True)) + + def forward(self, x: Tensor, vec: Tensor) -> Tensor: + shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1) + x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :] + x = self.linear(x) + return x + + +class Hunyuan3DDiT(nn.Module): + def __init__( + self, + in_channels: int = 64, + context_in_dim: int = 1536, + hidden_size: int = 1024, + mlp_ratio: float = 4.0, + num_heads: int = 16, + depth: int = 16, + depth_single_blocks: int = 32, + axes_dim: List[int] = [64], + theta: int = 10_000, + qkv_bias: bool = True, + time_factor: float = 1000, + guidance_embed: bool = False, + ckpt_path: Optional[str] = None, + **kwargs, + ): + super().__init__() + self.in_channels = in_channels + self.context_in_dim = context_in_dim + self.hidden_size = hidden_size + self.mlp_ratio = mlp_ratio + self.num_heads = num_heads + self.depth = depth + self.depth_single_blocks = depth_single_blocks + self.axes_dim = axes_dim + self.theta = theta + self.qkv_bias = qkv_bias + self.time_factor = time_factor + self.out_channels = self.in_channels + self.guidance_embed = guidance_embed + + if hidden_size % num_heads != 0: + raise ValueError( + f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}" + ) + pe_dim = hidden_size // num_heads + if sum(axes_dim) != pe_dim: + raise ValueError(f"Got {axes_dim} but expected positional dim {pe_dim}") + self.hidden_size = hidden_size + self.num_heads = num_heads + self.latent_in = nn.Linear(self.in_channels, self.hidden_size, bias=True) + self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) + self.cond_in = nn.Linear(context_in_dim, self.hidden_size) + self.guidance_in = ( + MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if guidance_embed else 
nn.Identity() + ) + + self.double_blocks = nn.ModuleList( + [ + DoubleStreamBlock( + self.hidden_size, + self.num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + ) + for _ in range(depth) + ] + ) + + self.single_blocks = nn.ModuleList( + [ + SingleStreamBlock( + self.hidden_size, + self.num_heads, + mlp_ratio=mlp_ratio, + ) + for _ in range(depth_single_blocks) + ] + ) + + self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels) + + if ckpt_path is not None: + print('restored denoiser ckpt', ckpt_path) + + ckpt = torch.load(ckpt_path, map_location="cpu") + if 'state_dict' not in ckpt: + # deepspeed ckpt + state_dict = {} + for k in ckpt.keys(): + new_k = k.replace('_forward_module.', '') + state_dict[new_k] = ckpt[k] + else: + state_dict = ckpt["state_dict"] + + final_state_dict = {} + for k, v in state_dict.items(): + if k.startswith('model.'): + final_state_dict[k.replace('model.', '')] = v + else: + final_state_dict[k] = v + missing, unexpected = self.load_state_dict(final_state_dict, strict=False) + print('unexpected keys:', unexpected) + print('missing keys:', missing) + + def forward( + self, + x, + t, + contexts, + **kwargs, + ) -> Tensor: + cond = contexts['main'] + latent = self.latent_in(x) + + vec = self.time_in(timestep_embedding(t, 256, self.time_factor).to(dtype=latent.dtype)) + if self.guidance_embed: + guidance = kwargs.get('guidance', None) + if guidance is None: + raise ValueError("Didn't get guidance strength for guidance distilled model.") + vec = vec + self.guidance_in(timestep_embedding(guidance, 256, self.time_factor)) + + cond = self.cond_in(cond) + pe = None + + for block in self.double_blocks: + latent, cond = block(img=latent, txt=cond, vec=vec, pe=pe) + + latent = torch.cat((cond, latent), 1) + for block in self.single_blocks: + latent = block(latent, vec=vec, pe=pe) + + latent = latent[:, cond.shape[1]:, ...] + latent = self.final_layer(latent, vec) + return latent diff --git a/hy3dgen/shapegen/pipelines.py b/hy3dgen/shapegen/pipelines.py new file mode 100644 index 0000000..09108a7 --- /dev/null +++ b/hy3dgen/shapegen/pipelines.py @@ -0,0 +1,765 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
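In forward, the conditioning tokens and the noisy latent tokens first run through the double-stream blocks as separate streams, are then concatenated for the single-stream blocks, and the latent part is finally projected back to in_channels. A shape-only sketch using the constructor defaults above; the batch size and token counts are arbitrary:

import torch
from hy3dgen.shapegen.models.denoisers import Hunyuan3DDiT

model = Hunyuan3DDiT()                           # in_channels=64, context_in_dim=1536, ...
x = torch.randn(2, 1024, 64)                     # (batch, latent tokens, in_channels)
t = torch.rand(2)                                # flow-matching timesteps in [0, 1]
contexts = {'main': torch.randn(2, 1370, 1536)}  # image-conditioning tokens

with torch.no_grad():
    v = model(x, t, contexts)                    # (2, 1024, 64), same shape as x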
+ +import copy +import importlib +import inspect +import os +from typing import List, Optional, Union + +import numpy as np +import torch +import trimesh +import yaml +from PIL import Image +from diffusers.utils.torch_utils import randn_tensor +from diffusers.utils.import_utils import is_accelerate_version, is_accelerate_available +from tqdm import tqdm + +from .models.autoencoders import ShapeVAE +from .models.autoencoders import SurfaceExtractors +from .utils import logger, synchronize_timer, smart_load_model + + +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +@synchronize_timer('Export to trimesh') +def export_to_trimesh(mesh_output): + if isinstance(mesh_output, list): + outputs = [] + for mesh in mesh_output: + if mesh is None: + outputs.append(None) + else: + mesh.mesh_f = mesh.mesh_f[:, ::-1] + mesh_output = trimesh.Trimesh(mesh.mesh_v, mesh.mesh_f) + outputs.append(mesh_output) + return outputs + else: + mesh_output.mesh_f = mesh_output.mesh_f[:, ::-1] + mesh_output = trimesh.Trimesh(mesh_output.mesh_v, mesh_output.mesh_f) + return mesh_output + + +def get_obj_from_str(string, reload=False): + module, cls = string.rsplit(".", 1) + if reload: + module_imp = importlib.import_module(module) + importlib.reload(module_imp) + return getattr(importlib.import_module(module, package=None), cls) + + +def instantiate_from_config(config, **kwargs): + if "target" not in config: + raise KeyError("Expected key `target` to instantiate.") + cls = get_obj_from_str(config["target"]) + params = config.get("params", dict()) + kwargs.update(params) + instance = cls(**kwargs) + return instance + + +class Hunyuan3DDiTPipeline: + model_cpu_offload_seq = "conditioner->model->vae" + _exclude_from_cpu_offload = [] + + @classmethod + @synchronize_timer('Hunyuan3DDiTPipeline Model Loading') + def from_single_file( + cls, + ckpt_path, + config_path, + device='cuda', + dtype=torch.float16, + use_safetensors=None, + **kwargs, + ): + # load config + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + + # load ckpt + if use_safetensors: + ckpt_path = ckpt_path.replace('.ckpt', '.safetensors') + if not os.path.exists(ckpt_path): + raise FileNotFoundError(f"Model file {ckpt_path} not found") + logger.info(f"Loading model from {ckpt_path}") + + if use_safetensors: + # parse safetensors + import safetensors.torch + safetensors_ckpt = safetensors.torch.load_file(ckpt_path, device='cpu') + ckpt = {} + for key, value in safetensors_ckpt.items(): + model_name = key.split('.')[0] + new_key = key[len(model_name) + 1:] + if model_name not in ckpt: + ckpt[model_name] = {} + ckpt[model_name][new_key] = value + else: + ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=True) + # load model + model = instantiate_from_config(config['model']) + model.load_state_dict(ckpt['model']) + vae = instantiate_from_config(config['vae']) + vae.load_state_dict(ckpt['vae']) + conditioner = instantiate_from_config(config['conditioner']) + if 'conditioner' in ckpt: + conditioner.load_state_dict(ckpt['conditioner']) + image_processor = instantiate_from_config(config['image_processor']) + scheduler = instantiate_from_config(config['scheduler']) + + model_kwargs = dict( + vae=vae, + model=model, + scheduler=scheduler, + conditioner=conditioner, + image_processor=image_processor, + device=device, + dtype=dtype, + ) + model_kwargs.update(kwargs) + + return cls( + **model_kwargs + ) + + @classmethod + def from_pretrained( + cls, + model_path, + device='cuda', + dtype=torch.float16, + use_safetensors=True, + variant='fp16', + subfolder='hunyuan3d-dit-v2-0', + **kwargs, + ): + kwargs['from_pretrained_kwargs'] = dict( + model_path=model_path, + subfolder=subfolder, + use_safetensors=use_safetensors, + variant=variant, + dtype=dtype, + device=device, + ) + config_path, ckpt_path = 
smart_load_model( + model_path, + subfolder=subfolder, + use_safetensors=use_safetensors, + variant=variant + ) + return cls.from_single_file( + ckpt_path, + config_path, + device=device, + dtype=dtype, + use_safetensors=use_safetensors, + **kwargs + ) + + def __init__( + self, + vae, + model, + scheduler, + conditioner, + image_processor, + device='cuda', + dtype=torch.float16, + **kwargs + ): + self.vae = vae + self.model = model + self.scheduler = scheduler + self.conditioner = conditioner + self.image_processor = image_processor + self.kwargs = kwargs + self.to(device, dtype) + + def compile(self): + self.vae = torch.compile(self.vae) + self.model = torch.compile(self.model) + self.conditioner = torch.compile(self.conditioner) + + def enable_flashvdm( + self, + enabled: bool = True, + adaptive_kv_selection=True, + topk_mode='mean', + mc_algo='mc', + replace_vae=True, + ): + if enabled: + model_path = self.kwargs['from_pretrained_kwargs']['model_path'] + turbo_vae_mapping = { + 'Hunyuan3D-2': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0-turbo'), + 'Hunyuan3D-2mv': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0-turbo'), + 'Hunyuan3D-2mini': ('tencent/Hunyuan3D-2mini', 'hunyuan3d-vae-v2-mini-turbo'), + } + model_name = model_path.split('/')[-1] + if replace_vae and model_name in turbo_vae_mapping: + model_path, subfolder = turbo_vae_mapping[model_name] + self.vae = ShapeVAE.from_pretrained( + model_path, subfolder=subfolder, + use_safetensors=self.kwargs['from_pretrained_kwargs']['use_safetensors'], + device=self.device, + ) + self.vae.enable_flashvdm_decoder( + enabled=enabled, + adaptive_kv_selection=adaptive_kv_selection, + topk_mode=topk_mode, + mc_algo=mc_algo + ) + else: + model_path = self.kwargs['from_pretrained_kwargs']['model_path'] + vae_mapping = { + 'Hunyuan3D-2': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0'), + 'Hunyuan3D-2mv': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0'), + 'Hunyuan3D-2mini': ('tencent/Hunyuan3D-2mini', 'hunyuan3d-vae-v2-mini'), + } + model_name = model_path.split('/')[-1] + if model_name in vae_mapping: + model_path, subfolder = vae_mapping[model_name] + self.vae = ShapeVAE.from_pretrained(model_path, subfolder=subfolder) + self.vae.enable_flashvdm_decoder(enabled=False) + + def to(self, device=None, dtype=None): + if dtype is not None: + self.dtype = dtype + self.vae.to(dtype=dtype) + self.model.to(dtype=dtype) + self.conditioner.to(dtype=dtype) + if device is not None: + self.device = torch.device(device) + self.vae.to(device) + self.model.to(device) + self.conditioner.to(device) + + @property + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + [`~DiffusionPipeline.enable_sequential_cpu_offload`] the execution device can only be inferred from + Accelerate's module hooks. + """ + for name, model in self.components.items(): + if not isinstance(model, torch.nn.Module) or name in self._exclude_from_cpu_offload: + continue + + if not hasattr(model, "_hf_hook"): + return self.device + for module in model.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. 
Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + + Arguments: + gpu_id (`int`, *optional*): + The ID of the accelerator that shall be used in inference. If not specified, it will default to 0. + device (`torch.Device` or `str`, *optional*, defaults to "cuda"): + The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will + default to "cuda". + """ + if self.model_cpu_offload_seq is None: + raise ValueError( + "Model CPU offload cannot be enabled because no `model_cpu_offload_seq` class attribute is set." + ) + + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + torch_device = torch.device(device) + device_index = torch_device.index + + if gpu_id is not None and device_index is not None: + raise ValueError( + f"You have passed both `gpu_id`={gpu_id} and an index as part of the passed device `device`={device}" + f"Cannot pass both. Please make sure to either not define `gpu_id` or not pass the index as part of the device: `device`={torch_device.type}" + ) + + # _offload_gpu_id should be set to passed gpu_id (or id in passed `device`) or default to previously set id or default to 0 + self._offload_gpu_id = gpu_id or torch_device.index or getattr(self, "_offload_gpu_id", 0) + + device_type = torch_device.type + device = torch.device(f"{device_type}:{self._offload_gpu_id}") + + if self.device.type != "cpu": + self.to("cpu") + device_mod = getattr(torch, self.device.type, None) + if hasattr(device_mod, "empty_cache") and device_mod.is_available(): + device_mod.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + all_model_components = {k: v for k, v in self.components.items() if isinstance(v, torch.nn.Module)} + + self._all_hooks = [] + hook = None + for model_str in self.model_cpu_offload_seq.split("->"): + model = all_model_components.pop(model_str, None) + if not isinstance(model, torch.nn.Module): + continue + + _, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook) + self._all_hooks.append(hook) + + # CPU offload models that are not in the seq chain unless they are explicitly excluded + # these models will stay on CPU until maybe_free_model_hooks is called + # some models cannot be in the seq chain because they are iteratively called, such as controlnet + for name, model in all_model_components.items(): + if not isinstance(model, torch.nn.Module): + continue + + if name in self._exclude_from_cpu_offload: + model.to(device) + else: + _, hook = cpu_offload_with_hook(model, device) + self._all_hooks.append(hook) + + def maybe_free_model_hooks(self): + r""" + Function that offloads all components, removes all model hooks that were added when using + `enable_model_cpu_offload` and then applies them again. In case the model has not been offloaded this function + is a no-op. Make sure to add this function to the end of the `__call__` function of your pipeline so that it + functions correctly when applying enable_model_cpu_offload. 
+ """ + if not hasattr(self, "_all_hooks") or len(self._all_hooks) == 0: + # `enable_model_cpu_offload` has not be called, so silently do nothing + return + + for hook in self._all_hooks: + # offload model and remove hook from model + hook.offload() + hook.remove() + + # make sure the model is in the same state as before calling it + self.enable_model_cpu_offload() + + @synchronize_timer('Encode cond') + def encode_cond(self, image, additional_cond_inputs, do_classifier_free_guidance, dual_guidance): + bsz = image.shape[0] + cond = self.conditioner(image=image, **additional_cond_inputs) + + if do_classifier_free_guidance: + un_cond = self.conditioner.unconditional_embedding(bsz, **additional_cond_inputs) + + if dual_guidance: + un_cond_drop_main = copy.deepcopy(un_cond) + un_cond_drop_main['additional'] = cond['additional'] + + def cat_recursive(a, b, c): + if isinstance(a, torch.Tensor): + return torch.cat([a, b, c], dim=0).to(self.dtype) + out = {} + for k in a.keys(): + out[k] = cat_recursive(a[k], b[k], c[k]) + return out + + cond = cat_recursive(cond, un_cond_drop_main, un_cond) + else: + def cat_recursive(a, b): + if isinstance(a, torch.Tensor): + return torch.cat([a, b], dim=0).to(self.dtype) + out = {} + for k in a.keys(): + out[k] = cat_recursive(a[k], b[k]) + return out + + cond = cat_recursive(cond, un_cond) + return cond + + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def prepare_latents(self, batch_size, dtype, device, generator, latents=None): + shape = (batch_size, *self.vae.latent_shape) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * getattr(self.scheduler, 'init_noise_sigma', 1.0) + return latents + + def prepare_image(self, image) -> dict: + if isinstance(image, str) and not os.path.exists(image): + raise FileNotFoundError(f"Couldn't find image at path {image}") + + if not isinstance(image, list): + image = [image] + + outputs = [] + for img in image: + output = self.image_processor(img) + outputs.append(output) + + cond_input = {k: [] for k in outputs[0].keys()} + for output in outputs: + for key, value in output.items(): + cond_input[key].append(value) + for key, value in cond_input.items(): + if isinstance(value[0], torch.Tensor): + cond_input[key] = torch.cat(value, dim=0) + + return cond_input + + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + def set_surface_extractor(self, mc_algo): + if mc_algo is None: + return + logger.info('The parameters `mc_algo` is deprecated, and will be removed in future versions.\n' + 'Please use: \n' + 'from hy3dgen.shapegen.models.autoencoders import SurfaceExtractors\n' + 'pipeline.vae.surface_extractor = SurfaceExtractors[mc_algo]() instead\n') + if mc_algo not in SurfaceExtractors.keys(): + raise ValueError(f"Unknown mc_algo {mc_algo}") + self.vae.surface_extractor = SurfaceExtractors[mc_algo]() + + @torch.no_grad() + def __call__( + self, + image: Union[str, List[str], Image.Image] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + sigmas: List[float] = None, + eta: float = 0.0, + guidance_scale: float = 7.5, + dual_guidance_scale: float = 10.5, + dual_guidance: bool = True, + generator=None, + box_v=1.01, + octree_resolution=384, + mc_level=-1 / 512, + num_chunks=8000, + mc_algo=None, + output_type: Optional[str] = "trimesh", + enable_pbar=True, + **kwargs, + ) -> List[List[trimesh.Trimesh]]: + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + self.set_surface_extractor(mc_algo) + + device = self.device + dtype = self.dtype + do_classifier_free_guidance = guidance_scale >= 0 and \ + getattr(self.model, 'guidance_cond_proj_dim', None) is None + dual_guidance = dual_guidance_scale >= 0 and dual_guidance + + cond_inputs = self.prepare_image(image) + image = cond_inputs.pop('image') + cond = self.encode_cond( + image=image, + additional_cond_inputs=cond_inputs, + do_classifier_free_guidance=do_classifier_free_guidance, + 
dual_guidance=False, + ) + batch_size = image.shape[0] + + t_dtype = torch.long + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas) + + latents = self.prepare_latents(batch_size, dtype, device, generator) + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + guidance_cond = None + if getattr(self.model, 'guidance_cond_proj_dim', None) is not None: + logger.info('Using lcm guidance scale') + guidance_scale_tensor = torch.tensor(guidance_scale - 1).repeat(batch_size) + guidance_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.model.guidance_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + with synchronize_timer('Diffusion Sampling'): + for i, t in enumerate(tqdm(timesteps, disable=not enable_pbar, desc="Diffusion Sampling:", leave=False)): + # expand the latents if we are doing classifier free guidance + if do_classifier_free_guidance: + latent_model_input = torch.cat([latents] * (3 if dual_guidance else 2)) + else: + latent_model_input = latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + timestep_tensor = torch.tensor([t], dtype=t_dtype, device=device) + timestep_tensor = timestep_tensor.expand(latent_model_input.shape[0]) + noise_pred = self.model(latent_model_input, timestep_tensor, cond, guidance_cond=guidance_cond) + + # no drop, drop clip, all drop + if do_classifier_free_guidance: + if dual_guidance: + noise_pred_clip, noise_pred_dino, noise_pred_uncond = noise_pred.chunk(3) + noise_pred = ( + noise_pred_uncond + + guidance_scale * (noise_pred_clip - noise_pred_dino) + + dual_guidance_scale * (noise_pred_dino - noise_pred_uncond) + ) + else: + noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + outputs = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) + latents = outputs.prev_sample + + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, outputs) + + return self._export( + latents, + output_type, + box_v, mc_level, num_chunks, octree_resolution, mc_algo, + ) + + def _export( + self, + latents, + output_type='trimesh', + box_v=1.01, + mc_level=0.0, + num_chunks=20000, + octree_resolution=256, + mc_algo='mc', + enable_pbar=True + ): + if not output_type == "latent": + latents = 1. 
/ self.vae.scale_factor * latents + latents = self.vae(latents) + outputs = self.vae.latents2mesh( + latents, + bounds=box_v, + mc_level=mc_level, + num_chunks=num_chunks, + octree_resolution=octree_resolution, + mc_algo=mc_algo, + enable_pbar=enable_pbar, + ) + else: + outputs = latents + + if output_type == 'trimesh': + outputs = export_to_trimesh(outputs) + + return outputs + + +class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline): + + @torch.inference_mode() + def __call__( + self, + image: Union[str, List[str], Image.Image, dict, List[dict]] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + sigmas: List[float] = None, + eta: float = 0.0, + guidance_scale: float = 5.0, + generator=None, + box_v=1.01, + octree_resolution=384, + mc_level=0.0, + mc_algo=None, + num_chunks=8000, + output_type: Optional[str] = "trimesh", + enable_pbar=True, + **kwargs, + ) -> List[List[trimesh.Trimesh]]: + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + self.set_surface_extractor(mc_algo) + + device = self.device + dtype = self.dtype + do_classifier_free_guidance = guidance_scale >= 0 and not ( + hasattr(self.model, 'guidance_embed') and + self.model.guidance_embed is True + ) + + cond_inputs = self.prepare_image(image) + image = cond_inputs.pop('image') + cond = self.encode_cond( + image=image, + additional_cond_inputs=cond_inputs, + do_classifier_free_guidance=do_classifier_free_guidance, + dual_guidance=False, + ) + batch_size = image.shape[0] + + # 5. Prepare timesteps + # NOTE: this is slightly different from common usage, we start from 0. + sigmas = np.linspace(0, 1, num_inference_steps) if sigmas is None else sigmas + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, + num_inference_steps, + device, + sigmas=sigmas, + ) + latents = self.prepare_latents(batch_size, dtype, device, generator) + + guidance = None + if hasattr(self.model, 'guidance_embed') and \ + self.model.guidance_embed is True: + guidance = torch.tensor([guidance_scale] * batch_size, device=device, dtype=dtype) + # logger.info(f'Using guidance embed with scale {guidance_scale}') + + with synchronize_timer('Diffusion Sampling'): + for i, t in enumerate(tqdm(timesteps, disable=not enable_pbar, desc="Diffusion Sampling:")): + # expand the latents if we are doing classifier free guidance + if do_classifier_free_guidance: + latent_model_input = torch.cat([latents] * 2) + else: + latent_model_input = latents + + # NOTE: we assume model get timesteps ranged from 0 to 1 + timestep = t.expand(latent_model_input.shape[0]).to( + latents.dtype) / self.scheduler.config.num_train_timesteps + noise_pred = self.model(latent_model_input, timestep, cond, guidance=guidance) + + if do_classifier_free_guidance: + noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + outputs = self.scheduler.step(noise_pred, t, latents) + latents = outputs.prev_sample + + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, outputs) + + return self._export( + latents, + output_type, + box_v, mc_level, num_chunks, octree_resolution, mc_algo, + enable_pbar=enable_pbar, + ) diff --git a/hy3dgen/shapegen/postprocessors.py b/hy3dgen/shapegen/postprocessors.py new file mode 100644 index 0000000..d258369 --- /dev/null +++ 
b/hy3dgen/shapegen/postprocessors.py @@ -0,0 +1,202 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import os +import tempfile +from typing import Union + +import numpy as np +import pymeshlab +import torch +import trimesh + +from .models.autoencoders import Latent2MeshOutput +from .utils import synchronize_timer + + +def load_mesh(path): + if path.endswith(".glb"): + mesh = trimesh.load(path) + else: + mesh = pymeshlab.MeshSet() + mesh.load_new_mesh(path) + return mesh + + +def reduce_face(mesh: pymeshlab.MeshSet, max_facenum: int = 200000): + if max_facenum > mesh.current_mesh().face_number(): + return mesh + + mesh.apply_filter( + "meshing_decimation_quadric_edge_collapse", + targetfacenum=max_facenum, + qualitythr=1.0, + preserveboundary=True, + boundaryweight=3, + preservenormal=True, + preservetopology=True, + autoclean=True + ) + return mesh + + +def remove_floater(mesh: pymeshlab.MeshSet): + mesh.apply_filter("compute_selection_by_small_disconnected_components_per_face", + nbfaceratio=0.005) + mesh.apply_filter("compute_selection_transfer_face_to_vertex", inclusive=False) + mesh.apply_filter("meshing_remove_selected_vertices_and_faces") + return mesh + + +def pymeshlab2trimesh(mesh: pymeshlab.MeshSet): + with tempfile.NamedTemporaryFile(suffix='.ply', delete=False) as temp_file: + mesh.save_current_mesh(temp_file.name) + mesh = trimesh.load(temp_file.name) + # 检查加载的对象类型 + if isinstance(mesh, trimesh.Scene): + combined_mesh = trimesh.Trimesh() + # 如果是Scene,遍历所有的geometry并合并 + for geom in mesh.geometry.values(): + combined_mesh = trimesh.util.concatenate([combined_mesh, geom]) + mesh = combined_mesh + return mesh + + +def trimesh2pymeshlab(mesh: trimesh.Trimesh): + with tempfile.NamedTemporaryFile(suffix='.ply', delete=False) as temp_file: + if isinstance(mesh, trimesh.scene.Scene): + for idx, obj in enumerate(mesh.geometry.values()): + if idx == 0: + temp_mesh = obj + else: + temp_mesh = temp_mesh + obj + mesh = temp_mesh + mesh.export(temp_file.name) + mesh = pymeshlab.MeshSet() + mesh.load_new_mesh(temp_file.name) + return mesh + + +def export_mesh(input, output): + if isinstance(input, pymeshlab.MeshSet): + mesh = output + elif isinstance(input, Latent2MeshOutput): + output = Latent2MeshOutput() + output.mesh_v = output.current_mesh().vertex_matrix() + output.mesh_f = output.current_mesh().face_matrix() + mesh = output + else: + mesh = pymeshlab2trimesh(output) + return mesh + + +def import_mesh(mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str]) -> pymeshlab.MeshSet: + if isinstance(mesh, str): + mesh = load_mesh(mesh) + elif isinstance(mesh, Latent2MeshOutput): + mesh = 
pymeshlab.MeshSet() + mesh_pymeshlab = pymeshlab.Mesh(vertex_matrix=mesh.mesh_v, face_matrix=mesh.mesh_f) + mesh.add_mesh(mesh_pymeshlab, "converted_mesh") + + if isinstance(mesh, (trimesh.Trimesh, trimesh.scene.Scene)): + mesh = trimesh2pymeshlab(mesh) + + return mesh + + +class FaceReducer: + @synchronize_timer('FaceReducer') + def __call__( + self, + mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str], + max_facenum: int = 40000 + ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh]: + ms = import_mesh(mesh) + ms = reduce_face(ms, max_facenum=max_facenum) + mesh = export_mesh(mesh, ms) + return mesh + + +class FloaterRemover: + @synchronize_timer('FloaterRemover') + def __call__( + self, + mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str], + ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput]: + ms = import_mesh(mesh) + ms = remove_floater(ms) + mesh = export_mesh(mesh, ms) + return mesh + + +class DegenerateFaceRemover: + @synchronize_timer('DegenerateFaceRemover') + def __call__( + self, + mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str], + ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput]: + ms = import_mesh(mesh) + + with tempfile.NamedTemporaryFile(suffix='.ply', delete=False) as temp_file: + ms.save_current_mesh(temp_file.name) + ms = pymeshlab.MeshSet() + ms.load_new_mesh(temp_file.name) + + mesh = export_mesh(mesh, ms) + return mesh + + +def mesh_normalize(mesh): + """ + Normalize mesh vertices to sphere + """ + scale_factor = 1.2 + vtx_pos = np.asarray(mesh.vertices) + max_bb = (vtx_pos - 0).max(0)[0] + min_bb = (vtx_pos - 0).min(0)[0] + + center = (max_bb + min_bb) / 2 + + scale = torch.norm(torch.tensor(vtx_pos - center, dtype=torch.float32), dim=1).max() * 2.0 + + vtx_pos = (vtx_pos - center) * (scale_factor / float(scale)) + mesh.vertices = vtx_pos + + return mesh + + +class MeshSimplifier: + def __init__(self, executable: str = None): + if executable is None: + CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) + executable = os.path.join(CURRENT_DIR, "mesh_simplifier.bin") + self.executable = executable + + @synchronize_timer('MeshSimplifier') + def __call__( + self, + mesh: Union[trimesh.Trimesh], + ) -> Union[trimesh.Trimesh]: + with tempfile.NamedTemporaryFile(suffix='.obj', delete=False) as temp_input: + with tempfile.NamedTemporaryFile(suffix='.obj', delete=False) as temp_output: + mesh.export(temp_input.name) + os.system(f'{self.executable} {temp_input.name} {temp_output.name}') + ms = trimesh.load(temp_output.name, process=False) + if isinstance(ms, trimesh.Scene): + combined_mesh = trimesh.Trimesh() + for geom in ms.geometry.values(): + combined_mesh = trimesh.util.concatenate([combined_mesh, geom]) + ms = combined_mesh + ms = mesh_normalize(ms) + return ms diff --git a/hy3dgen/shapegen/preprocessors.py b/hy3dgen/shapegen/preprocessors.py new file mode 100644 index 0000000..8a9cb9e --- /dev/null +++ b/hy3dgen/shapegen/preprocessors.py @@ -0,0 +1,167 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. 
+ +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import cv2 +import numpy as np +import torch +from PIL import Image +from einops import repeat, rearrange + + +def array_to_tensor(np_array): + image_pt = torch.tensor(np_array).float() + image_pt = image_pt / 255 * 2 - 1 + image_pt = rearrange(image_pt, "h w c -> c h w") + image_pts = repeat(image_pt, "c h w -> b c h w", b=1) + return image_pts + + +class ImageProcessorV2: + def __init__(self, size=512, border_ratio=None): + self.size = size + self.border_ratio = border_ratio + + @staticmethod + def recenter(image, border_ratio: float = 0.2): + """ recenter an image to leave some empty space at the image border. + + Args: + image (ndarray): input image, float/uint8 [H, W, 3/4] + mask (ndarray): alpha mask, bool [H, W] + border_ratio (float, optional): border ratio, image will be resized to (1 - border_ratio). Defaults to 0.2. + + Returns: + ndarray: output image, float/uint8 [H, W, 3/4] + """ + + if image.shape[-1] == 4: + mask = image[..., 3] + else: + mask = np.ones_like(image[..., 0:1]) * 255 + image = np.concatenate([image, mask], axis=-1) + mask = mask[..., 0] + + H, W, C = image.shape + + size = max(H, W) + result = np.zeros((size, size, C), dtype=np.uint8) + + coords = np.nonzero(mask) + x_min, x_max = coords[0].min(), coords[0].max() + y_min, y_max = coords[1].min(), coords[1].max() + h = x_max - x_min + w = y_max - y_min + if h == 0 or w == 0: + raise ValueError('input image is empty') + desired_size = int(size * (1 - border_ratio)) + scale = desired_size / max(h, w) + h2 = int(h * scale) + w2 = int(w * scale) + x2_min = (size - h2) // 2 + x2_max = x2_min + h2 + + y2_min = (size - w2) // 2 + y2_max = y2_min + w2 + + result[x2_min:x2_max, y2_min:y2_max] = cv2.resize(image[x_min:x_max, y_min:y_max], (w2, h2), + interpolation=cv2.INTER_AREA) + + bg = np.ones((result.shape[0], result.shape[1], 3), dtype=np.uint8) * 255 + + mask = result[..., 3:].astype(np.float32) / 255 + result = result[..., :3] * mask + bg * (1 - mask) + + mask = mask * 255 + result = result.clip(0, 255).astype(np.uint8) + mask = mask.clip(0, 255).astype(np.uint8) + return result, mask + + def load_image(self, image, border_ratio=0.15, to_tensor=True): + if isinstance(image, str): + image = cv2.imread(image, cv2.IMREAD_UNCHANGED) + image, mask = self.recenter(image, border_ratio=border_ratio) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + elif isinstance(image, Image.Image): + image = image.convert("RGBA") + image = np.asarray(image) + image, mask = self.recenter(image, border_ratio=border_ratio) + + image = cv2.resize(image, (self.size, self.size), interpolation=cv2.INTER_CUBIC) + mask = cv2.resize(mask, (self.size, self.size), interpolation=cv2.INTER_NEAREST) + mask = mask[..., np.newaxis] + + if to_tensor: + image = array_to_tensor(image) + mask = array_to_tensor(mask) + return image, mask + + def __call__(self, image, border_ratio=0.15, to_tensor=True, **kwargs): + if self.border_ratio is not None: + border_ratio = self.border_ratio + image, mask = self.load_image(image, border_ratio=border_ratio, to_tensor=to_tensor) + outputs = { + 'image': image, + 'mask': mask + } + return 
outputs + + +class MVImageProcessorV2(ImageProcessorV2): + """ + view order: front, front clockwise 90, back, front clockwise 270 + """ + return_view_idx = True + + def __init__(self, size=512, border_ratio=None): + super().__init__(size, border_ratio) + self.view2idx = { + 'front': 0, + 'left': 1, + 'back': 2, + 'right': 3 + } + + def __call__(self, image_dict, border_ratio=0.15, to_tensor=True, **kwargs): + if self.border_ratio is not None: + border_ratio = self.border_ratio + + images = [] + masks = [] + view_idxs = [] + for idx, (view_tag, image) in enumerate(image_dict.items()): + view_idxs.append(self.view2idx[view_tag]) + image, mask = self.load_image(image, border_ratio=border_ratio, to_tensor=to_tensor) + images.append(image) + masks.append(mask) + + zipped_lists = zip(view_idxs, images, masks) + sorted_zipped_lists = sorted(zipped_lists) + view_idxs, images, masks = zip(*sorted_zipped_lists) + + image = torch.cat(images, 0).unsqueeze(0) + mask = torch.cat(masks, 0).unsqueeze(0) + outputs = { + 'image': image, + 'mask': mask, + 'view_idxs': view_idxs + } + return outputs + + +IMAGE_PROCESSORS = { + "v2": ImageProcessorV2, + 'mv_v2': MVImageProcessorV2, +} + +DEFAULT_IMAGEPROCESSOR = 'v2' diff --git a/hy3dgen/shapegen/schedulers.py b/hy3dgen/shapegen/schedulers.py new file mode 100644 index 0000000..13f0da8 --- /dev/null +++ b/hy3dgen/shapegen/schedulers.py @@ -0,0 +1,480 @@ +# Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.schedulers.scheduling_utils import SchedulerMixin +from diffusers.utils import BaseOutput, logging + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class FlowMatchEulerDiscreteSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. 
+ + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + """ + + prev_sample: torch.FloatTensor + + +class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin): + """ + NOTE: this is very similar to diffusers.FlowMatchEulerDiscreteScheduler. Except our timesteps are reversed + + Euler scheduler. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + shift (`float`, defaults to 1.0): + The shift value for the timestep schedule. + """ + + _compatibles = [] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + shift: float = 1.0, + use_dynamic_shifting=False, + ): + timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32).copy() + timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32) + + sigmas = timesteps / num_train_timesteps + if not use_dynamic_shifting: + # when use_dynamic_shifting is True, we apply the timestep shifting on the fly based on the image resolution + sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) + + self.timesteps = sigmas * num_train_timesteps + + self._step_index = None + self._begin_index = None + + self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication + self.sigma_min = self.sigmas[-1].item() + self.sigma_max = self.sigmas[0].item() + + @property + def step_index(self): + """ + The index counter for current timestep. It will increase 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def scale_noise( + self, + sample: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + noise: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + """ + Forward process in flow-matching + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. 
+ """ + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=sample.device, dtype=sample.dtype) + + if sample.device.type == "mps" and torch.is_floating_point(timestep): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(sample.device, dtype=torch.float32) + timestep = timestep.to(sample.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(sample.device) + timestep = timestep.to(sample.device) + + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timestep] + elif self.step_index is not None: + # add_noise is called after first denoising step (for inpainting) + step_indices = [self.step_index] * timestep.shape[0] + else: + # add noise is called before first denoising step to create initial latent(img2img) + step_indices = [self.begin_index] * timestep.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(sample.shape): + sigma = sigma.unsqueeze(-1) + + sample = sigma * noise + (1.0 - sigma) * sample + + return sample + + def _sigma_to_t(self, sigma): + return sigma * self.config.num_train_timesteps + + def time_shift(self, mu: float, sigma: float, t: torch.Tensor): + return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma) + + def set_timesteps( + self, + num_inference_steps: int = None, + device: Union[str, torch.device] = None, + sigmas: Optional[List[float]] = None, + mu: Optional[float] = None, + ): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + + if self.config.use_dynamic_shifting and mu is None: + raise ValueError(" you have a pass a value for `mu` when `use_dynamic_shifting` is set to be `True`") + + if sigmas is None: + self.num_inference_steps = num_inference_steps + timesteps = np.linspace( + self._sigma_to_t(self.sigma_max), self._sigma_to_t(self.sigma_min), num_inference_steps + ) + + sigmas = timesteps / self.config.num_train_timesteps + + if self.config.use_dynamic_shifting: + sigmas = self.time_shift(mu, 1.0, sigmas) + else: + sigmas = self.config.shift * sigmas / (1 + (self.config.shift - 1) * sigmas) + + sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device) + timesteps = sigmas * self.config.num_train_timesteps + + self.timesteps = timesteps.to(device=device) + self.sigmas = torch.cat([sigmas, torch.ones(1, device=sigmas.device)]) + + self._step_index = None + self._begin_index = None + + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() + + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + sample: torch.FloatTensor, + s_churn: float = 0.0, + s_tmin: float = 0.0, + s_tmax: float = float("inf"), + s_noise: float = 1.0, + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + ) -> Union[FlowMatchEulerDiscreteSchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`float`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + s_churn (`float`): + s_tmin (`float`): + s_tmax (`float`): + s_noise (`float`, defaults to 1.0): + Scaling factor for noise added to the sample. + generator (`torch.Generator`, *optional*): + A random number generator. + return_dict (`bool`): + Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or + tuple. + + Returns: + [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is + returned, otherwise a tuple is returned where the first element is the sample tensor. + """ + + if ( + isinstance(timestep, int) + or isinstance(timestep, torch.IntTensor) + or isinstance(timestep, torch.LongTensor) + ): + raise ValueError( + ( + "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" + " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass" + " one of the `scheduler.timesteps` as a timestep." 
+ ), + ) + + if self.step_index is None: + self._init_step_index(timestep) + + # Upcast to avoid precision issues when computing prev_sample + sample = sample.to(torch.float32) + + sigma = self.sigmas[self.step_index] + sigma_next = self.sigmas[self.step_index + 1] + + prev_sample = sample + (sigma_next - sigma) * model_output + + # Cast sample back to model compatible dtype + prev_sample = prev_sample.to(model_output.dtype) + + # upon completion increase step index by one + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + + return FlowMatchEulerDiscreteSchedulerOutput(prev_sample=prev_sample) + + def __len__(self): + return self.config.num_train_timesteps + + +@dataclass +class ConsistencyFlowMatchEulerDiscreteSchedulerOutput(BaseOutput): + prev_sample: torch.FloatTensor + pred_original_sample: torch.FloatTensor + + +class ConsistencyFlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin): + _compatibles = [] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + pcm_timesteps: int = 50, + ): + sigmas = np.linspace(0, 1, num_train_timesteps) + step_ratio = num_train_timesteps // pcm_timesteps + + euler_timesteps = (np.arange(1, pcm_timesteps) * step_ratio).round().astype(np.int64) - 1 + euler_timesteps = np.asarray([0] + euler_timesteps.tolist()) + + self.euler_timesteps = euler_timesteps + self.sigmas = sigmas[self.euler_timesteps] + self.sigmas = torch.from_numpy((self.sigmas.copy())).to(dtype=torch.float32) + self.timesteps = self.sigmas * num_train_timesteps + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + @property + def step_index(self): + """ + The index counter for current timestep. It will increase 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def _sigma_to_t(self, sigma): + return sigma * self.config.num_train_timesteps + + def set_timesteps( + self, + num_inference_steps: int = None, + device: Union[str, torch.device] = None, + sigmas: Optional[List[float]] = None, + ): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. 
+ """ + self.num_inference_steps = num_inference_steps if num_inference_steps is not None else len(sigmas) + inference_indices = np.linspace( + 0, self.config.pcm_timesteps, num=self.num_inference_steps, endpoint=False + ) + inference_indices = np.floor(inference_indices).astype(np.int64) + inference_indices = torch.from_numpy(inference_indices).long() + + self.sigmas_ = self.sigmas[inference_indices] + timesteps = self.sigmas_ * self.config.num_train_timesteps + self.timesteps = timesteps.to(device=device) + self.sigmas_ = torch.cat( + [self.sigmas_, torch.ones(1, device=self.sigmas_.device)] + ) + + self._step_index = None + self._begin_index = None + + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. for image-to-image) + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() + + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + sample: torch.FloatTensor, + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + ) -> Union[ConsistencyFlowMatchEulerDiscreteSchedulerOutput, Tuple]: + if ( + isinstance(timestep, int) + or isinstance(timestep, torch.IntTensor) + or isinstance(timestep, torch.LongTensor) + ): + raise ValueError( + ( + "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" + " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass" + " one of the `scheduler.timesteps` as a timestep." + ), + ) + + if self.step_index is None: + self._init_step_index(timestep) + + sample = sample.to(torch.float32) + + sigma = self.sigmas_[self.step_index] + sigma_next = self.sigmas_[self.step_index + 1] + + prev_sample = sample + (sigma_next - sigma) * model_output + prev_sample = prev_sample.to(model_output.dtype) + + pred_original_sample = sample + (1.0 - sigma) * model_output + pred_original_sample = pred_original_sample.to(model_output.dtype) + + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + + return ConsistencyFlowMatchEulerDiscreteSchedulerOutput(prev_sample=prev_sample, + pred_original_sample=pred_original_sample) + + def __len__(self): + return self.config.num_train_timesteps diff --git a/hy3dgen/shapegen/utils.py b/hy3dgen/shapegen/utils.py new file mode 100644 index 0000000..6ac8f5d --- /dev/null +++ b/hy3dgen/shapegen/utils.py @@ -0,0 +1,126 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. 
+ +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import logging +import os +from functools import wraps + +import torch + + +def get_logger(name): + logger = logging.getLogger(name) + logger.setLevel(logging.INFO) + + console_handler = logging.StreamHandler() + console_handler.setLevel(logging.INFO) + + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + console_handler.setFormatter(formatter) + logger.addHandler(console_handler) + return logger + + +logger = get_logger('hy3dgen.shapgen') + + +class synchronize_timer: + """ Synchronized timer to count the inference time of `nn.Module.forward`. + + Supports both context manager and decorator usage. + + Example as context manager: + ```python + with synchronize_timer('name') as t: + run() + ``` + + Example as decorator: + ```python + @synchronize_timer('Export to trimesh') + def export_to_trimesh(mesh_output): + pass + ``` + """ + + def __init__(self, name=None): + self.name = name + + def __enter__(self): + """Context manager entry: start timing.""" + if os.environ.get('HY3DGEN_DEBUG', '0') == '1': + self.start = torch.cuda.Event(enable_timing=True) + self.end = torch.cuda.Event(enable_timing=True) + self.start.record() + return lambda: self.time + + def __exit__(self, exc_type, exc_value, exc_tb): + """Context manager exit: stop timing and log results.""" + if os.environ.get('HY3DGEN_DEBUG', '0') == '1': + self.end.record() + torch.cuda.synchronize() + self.time = self.start.elapsed_time(self.end) + if self.name is not None: + logger.info(f'{self.name} takes {self.time} ms') + + def __call__(self, func): + """Decorator: wrap the function to time its execution.""" + + @wraps(func) + def wrapper(*args, **kwargs): + with self: + result = func(*args, **kwargs) + return result + + return wrapper + + +def smart_load_model( + model_path, + subfolder, + use_safetensors, + variant, +): + original_model_path = model_path + # try local path + base_dir = os.environ.get('HY3DGEN_MODELS', '~/.cache/hy3dgen') + model_path = os.path.expanduser(os.path.join(base_dir, model_path, subfolder)) + logger.info(f'Try to load model from local path: {model_path}') + if not os.path.exists(model_path): + logger.info('Model path not exists, try to download from huggingface') + try: + from huggingface_hub import snapshot_download + # 只下载指定子目录 + path = snapshot_download( + repo_id=original_model_path, + allow_patterns=[f"{subfolder}/*"], # 关键修改:模式匹配子文件夹 + ) + model_path = os.path.join(path, subfolder) # 保持路径拼接逻辑不变 + except ImportError: + logger.warning( + "You need to install HuggingFace Hub to load models from the hub." 
+ ) + raise RuntimeError(f"Model path {model_path} not found") + except Exception as e: + raise e + + if not os.path.exists(model_path): + raise FileNotFoundError(f"Model path {original_model_path} not found") + + extension = 'ckpt' if not use_safetensors else 'safetensors' + variant = '' if variant is None else f'.{variant}' + ckpt_name = f'model{variant}.{extension}' + config_path = os.path.join(model_path, 'config.yaml') + ckpt_path = os.path.join(model_path, ckpt_name) + return config_path, ckpt_path diff --git a/hy3dgen/texgen/__init__.py b/hy3dgen/texgen/__init__.py new file mode 100644 index 0000000..7054c57 --- /dev/null +++ b/hy3dgen/texgen/__init__.py @@ -0,0 +1,16 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + + +from .pipelines import Hunyuan3DPaintPipeline, Hunyuan3DTexGenConfig diff --git a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/__init__.py b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/__init__.py new file mode 100644 index 0000000..f471e1a --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/__init__.py @@ -0,0 +1,22 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
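+
+# Note: the star imports below re-export this package's public surface -- the
+# GLB/OBJ loaders from `io_glb`/`io_obj` and the rasterization entry points
+# from `render` -- while the original explicit import list is kept commented
+# out for reference.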
+ +''' +from .hierarchy import BuildHierarchy, BuildHierarchyWithColor +from .io_obj import LoadObj, LoadObjWithTexture +from .render import rasterize, interpolate +''' +from .io_glb import * +from .io_obj import * +from .render import * diff --git a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_glb.py b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_glb.py new file mode 100644 index 0000000..f1daf7f --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_glb.py @@ -0,0 +1,241 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import base64 +import io +import os + +import numpy as np +from PIL import Image as PILImage +from pygltflib import GLTF2 +from scipy.spatial.transform import Rotation as R + + +# Function to extract buffer data +def get_buffer_data(gltf, buffer_view): + buffer = gltf.buffers[buffer_view.buffer] + buffer_data = gltf.get_data_from_buffer_uri(buffer.uri) + byte_offset = buffer_view.byteOffset if buffer_view.byteOffset else 0 + byte_length = buffer_view.byteLength + return buffer_data[byte_offset:byte_offset + byte_length] + + +# Function to extract attribute data +def get_attribute_data(gltf, accessor_index): + accessor = gltf.accessors[accessor_index] + buffer_view = gltf.bufferViews[accessor.bufferView] + buffer_data = get_buffer_data(gltf, buffer_view) + + comptype = {5120: np.int8, 5121: np.uint8, 5122: np.int16, 5123: np.uint16, 5125: np.uint32, 5126: np.float32} + dtype = comptype[accessor.componentType] + + t2n = {'SCALAR': 1, 'VEC2': 2, 'VEC3': 3, 'VEC4': 4, 'MAT2': 4, 'MAT3': 9, 'MAT4': 16} + num_components = t2n[accessor.type] + + # Calculate the correct slice of data + byte_offset = accessor.byteOffset if accessor.byteOffset else 0 + byte_stride = buffer_view.byteStride if buffer_view.byteStride else num_components * np.dtype(dtype).itemsize + count = accessor.count + + # Extract the attribute data + attribute_data = np.zeros((count, num_components), dtype=dtype) + for i in range(count): + start = byte_offset + i * byte_stride + end = start + num_components * np.dtype(dtype).itemsize + attribute_data[i] = np.frombuffer(buffer_data[start:end], dtype=dtype) + + return attribute_data + + +# Function to extract image data +def get_image_data(gltf, image, folder): + if image.uri: + if image.uri.startswith('data:'): + # Data URI + header, encoded = image.uri.split(',', 1) + data = base64.b64decode(encoded) + else: + # External file + fn = image.uri + if not os.path.isabs(fn): + fn = folder + '/' + fn + with open(fn, 'rb') as f: + data = f.read() + else: + buffer_view = gltf.bufferViews[image.bufferView] + data = get_buffer_data(gltf, buffer_view) + return data 
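+
+# Worked example for the two converters below: a TRIANGLE_STRIP index list
+# [0, 1, 2, 3, 4] expands to the triangles (0, 1, 2), (1, 3, 2), (2, 3, 4)
+# (the winding is flipped on every odd step so all faces keep a consistent
+# orientation), while a TRIANGLE_FAN list [0, 1, 2, 3, 4] expands to
+# (0, 1, 2), (0, 2, 3), (0, 3, 4).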
+ + +# Function to convert triangle strip to triangles +def convert_triangle_strip_to_triangles(indices): + triangles = [] + for i in range(len(indices) - 2): + if i % 2 == 0: + triangles.append([indices[i], indices[i + 1], indices[i + 2]]) + else: + triangles.append([indices[i], indices[i + 2], indices[i + 1]]) + return np.array(triangles).reshape(-1, 3) + + +# Function to convert triangle fan to triangles +def convert_triangle_fan_to_triangles(indices): + triangles = [] + for i in range(1, len(indices) - 1): + triangles.append([indices[0], indices[i], indices[i + 1]]) + return np.array(triangles).reshape(-1, 3) + + +# Function to get the transformation matrix from a node +def get_node_transform(node): + if node.matrix: + return np.array(node.matrix).reshape(4, 4).T + else: + T = np.eye(4) + if node.translation: + T[:3, 3] = node.translation + if node.rotation: + R_mat = R.from_quat(node.rotation).as_matrix() + T[:3, :3] = R_mat + if node.scale: + S = np.diag(node.scale + [1]) + T = T @ S + return T + + +def get_world_transform(gltf, node_index, parents, world_transforms): + if parents[node_index] == -2: + return world_transforms[node_index] + + node = gltf.nodes[node_index] + if parents[node_index] == -1: + world_transforms[node_index] = get_node_transform(node) + parents[node_index] = -2 + return world_transforms[node_index] + + parent_index = parents[node_index] + parent_transform = get_world_transform(gltf, parent_index, parents, world_transforms) + world_transforms[node_index] = parent_transform @ get_node_transform(node) + parents[node_index] = -2 + return world_transforms[node_index] + + +def LoadGlb(path): + # Load the GLB file using pygltflib + gltf = GLTF2().load(path) + + primitives = [] + images = {} + # Iterate through the meshes in the GLB file + + world_transforms = [np.identity(4) for i in range(len(gltf.nodes))] + parents = [-1 for i in range(len(gltf.nodes))] + for node_index, node in enumerate(gltf.nodes): + for idx in node.children: + parents[idx] = node_index + # for i in range(len(gltf.nodes)): + # get_world_transform(gltf, i, parents, world_transform) + + for node_index, node in enumerate(gltf.nodes): + if node.mesh is not None: + world_transform = get_world_transform(gltf, node_index, parents, world_transforms) + # Iterate through the primitives in the mesh + mesh = gltf.meshes[node.mesh] + for primitive in mesh.primitives: + # Access the attributes of the primitive + attributes = primitive.attributes.__dict__ + mode = primitive.mode if primitive.mode is not None else 4 # Default to TRIANGLES + result = {} + if primitive.indices is not None: + indices = get_attribute_data(gltf, primitive.indices) + if mode == 4: # TRIANGLES + face_indices = indices.reshape(-1, 3) + elif mode == 5: # TRIANGLE_STRIP + face_indices = convert_triangle_strip_to_triangles(indices) + elif mode == 6: # TRIANGLE_FAN + face_indices = convert_triangle_fan_to_triangles(indices) + else: + continue + result['F'] = face_indices + + # Extract vertex positions + if 'POSITION' in attributes and attributes['POSITION'] is not None: + positions = get_attribute_data(gltf, attributes['POSITION']) + # Apply the world transformation to the positions + positions_homogeneous = np.hstack([positions, np.ones((positions.shape[0], 1))]) + transformed_positions = (world_transform @ positions_homogeneous.T).T[:, :3] + result['V'] = transformed_positions + + # Extract vertex colors + if 'COLOR_0' in attributes and attributes['COLOR_0'] is not None: + colors = get_attribute_data(gltf, attributes['COLOR_0']) + if 
colors.shape[-1] > 3: + colors = colors[..., :3] + result['VC'] = colors + + # Extract UVs + if 'TEXCOORD_0' in attributes and not attributes['TEXCOORD_0'] is None: + uvs = get_attribute_data(gltf, attributes['TEXCOORD_0']) + result['UV'] = uvs + + if primitive.material is not None: + material = gltf.materials[primitive.material] + if ( + material.pbrMetallicRoughness is not None + and material.pbrMetallicRoughness.baseColorTexture is not None + ): + texture_index = material.pbrMetallicRoughness.baseColorTexture.index + texture = gltf.textures[texture_index] + image_index = texture.source + if not image_index in images: + image = gltf.images[image_index] + image_data = get_image_data(gltf, image, os.path.dirname(path)) + pil_image = PILImage.open(io.BytesIO(image_data)) + if pil_image.mode != 'RGB': + pil_image = pil_image.convert('RGB') + images[image_index] = pil_image + result['TEX'] = image_index + elif material.emissiveTexture is not None: + texture_index = material.emissiveTexture.index + texture = gltf.textures[texture_index] + image_index = texture.source + if not image_index in images: + image = gltf.images[image_index] + image_data = get_image_data(gltf, image, os.path.dirname(path)) + pil_image = PILImage.open(io.BytesIO(image_data)) + if pil_image.mode != 'RGB': + pil_image = pil_image.convert('RGB') + images[image_index] = pil_image + result['TEX'] = image_index + else: + if material.pbrMetallicRoughness is not None: + base_color = material.pbrMetallicRoughness.baseColorFactor + else: + base_color = np.array([0.8, 0.8, 0.8], dtype=np.float32) + result['MC'] = base_color + + primitives.append(result) + + return primitives, images + + +def RotatePrimitives(primitives, transform): + for i in range(len(primitives)): + if 'V' in primitives[i]: + primitives[i]['V'] = primitives[i]['V'] @ transform.T + + +if __name__ == '__main__': + path = 'data/test.glb' + LoadGlb(path) diff --git a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_obj.py b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_obj.py new file mode 100644 index 0000000..e40d500 --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_obj.py @@ -0,0 +1,66 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
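The strip and fan converters defined earlier in io_glb.py are easy to sanity-check by hand. The snippet below restates their logic standalone (it does not import the module): a 5-index triangle strip yields three triangles with alternating winding, while a 5-index fan anchors every triangle at index 0.

import numpy as np

def strip_to_triangles(indices):
    tris = []
    for i in range(len(indices) - 2):
        if i % 2 == 0:
            tris.append([indices[i], indices[i + 1], indices[i + 2]])
        else:
            tris.append([indices[i], indices[i + 2], indices[i + 1]])
    return np.array(tris).reshape(-1, 3)

def fan_to_triangles(indices):
    return np.array([[indices[0], indices[i], indices[i + 1]]
                     for i in range(1, len(indices) - 1)]).reshape(-1, 3)

print(strip_to_triangles([0, 1, 2, 3, 4]))  # [[0 1 2] [1 3 2] [2 3 4]]
print(fan_to_triangles([0, 1, 2, 3, 4]))    # [[0 1 2] [0 2 3] [0 3 4]]

The alternating index swap in the strip case keeps a consistent facing direction, mirroring convert_triangle_strip_to_triangles above.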
+ +import cv2 +import numpy as np + + +def LoadObj(fn): + lines = [l.strip() for l in open(fn)] + vertices = [] + faces = [] + for l in lines: + words = [w for w in l.split(' ') if w != ''] + if len(words) == 0: + continue + if words[0] == 'v': + v = [float(words[i]) for i in range(1, 4)] + vertices.append(v) + elif words[0] == 'f': + f = [int(words[i]) - 1 for i in range(1, 4)] + faces.append(f) + + return np.array(vertices).astype('float32'), np.array(faces).astype('int32') + + +def LoadObjWithTexture(fn, tex_fn): + lines = [l.strip() for l in open(fn)] + vertices = [] + vertex_textures = [] + faces = [] + face_textures = [] + for l in lines: + words = [w for w in l.split(' ') if w != ''] + if len(words) == 0: + continue + if words[0] == 'v': + v = [float(words[i]) for i in range(1, len(words))] + vertices.append(v) + elif words[0] == 'vt': + v = [float(words[i]) for i in range(1, len(words))] + vertex_textures.append(v) + elif words[0] == 'f': + f = [] + ft = [] + for i in range(1, len(words)): + t = words[i].split('/') + f.append(int(t[0]) - 1) + ft.append(int(t[1]) - 1) + for i in range(2, len(f)): + faces.append([f[0], f[i - 1], f[i]]) + face_textures.append([ft[0], ft[i - 1], ft[i]]) + + tex_image = cv2.cvtColor(cv2.imread(tex_fn), cv2.COLOR_BGR2RGB) + return np.array(vertices).astype('float32'), np.array(vertex_textures).astype('float32'), np.array(faces).astype( + 'int32'), np.array(face_textures).astype('int32'), tex_image diff --git a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/render.py b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/render.py new file mode 100644 index 0000000..2d4d3f7 --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/render.py @@ -0,0 +1,31 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
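Before the rasterizer wrappers below, here is a standalone illustration of the 'f v/vt' face parsing in LoadObjWithTexture above: OBJ's 1-based indices are shifted to 0-based, and an n-gon is fanned into n - 2 triangles for both position and UV indices. The face line is an invented example.

line = 'f 1/1 2/2 3/3 4/4'   # a quad referencing 4 positions and 4 UVs
words = [w for w in line.split(' ') if w != '']

f, ft = [], []
for w in words[1:]:
    t = w.split('/')
    f.append(int(t[0]) - 1)   # position index, 0-based
    ft.append(int(t[1]) - 1)  # texture-coordinate index, 0-based

faces = [[f[0], f[i - 1], f[i]] for i in range(2, len(f))]
face_textures = [[ft[0], ft[i - 1], ft[i]] for i in range(2, len(ft))]
print(faces)          # [[0, 1, 2], [0, 2, 3]]
print(face_textures)  # [[0, 1, 2], [0, 2, 3]]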
+ +import custom_rasterizer_kernel +import torch + + +def rasterize(pos, tri, resolution, clamp_depth=torch.zeros(0), use_depth_prior=0): + assert (pos.device == tri.device) + findices, barycentric = custom_rasterizer_kernel.rasterize_image(pos[0], tri, clamp_depth, resolution[1], + resolution[0], 1e-6, use_depth_prior) + return findices, barycentric + + +def interpolate(col, findices, barycentric, tri): + f = findices - 1 + (findices == 0) + vcol = col[0, tri.long()[f.long()]] + result = barycentric.view(*barycentric.shape, 1) * vcol + result = torch.sum(result, axis=-2) + return result.view(1, *result.shape) diff --git a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/__init__.py b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/__init__.py new file mode 100644 index 0000000..1614ff8 --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/__init__.py @@ -0,0 +1,13 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
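A hedged usage sketch of the rasterize/interpolate pair defined in render.py above. It assumes the custom_rasterizer package and its custom_rasterizer_kernel extension have been built and installed (see the setup.py later in this diff); with CPU tensors the kernel dispatches to its CPU path. The triangle and colors are arbitrary test values.

import torch
import custom_rasterizer as cr

# One triangle in clip space (x, y, z, w), batched with a leading dim of 1.
pos = torch.tensor([[[-0.5, -0.5, 0.0, 1.0],
                     [ 0.5, -0.5, 0.0, 1.0],
                     [ 0.0,  0.5, 0.0, 1.0]]], dtype=torch.float32)
tri = torch.tensor([[0, 1, 2]], dtype=torch.int32)

findices, barycentric = cr.rasterize(pos, tri, resolution=(256, 256))

# Per-vertex RGB colors interpolated into an image via the rasterized barycentrics.
col = torch.tensor([[[1.0, 0.0, 0.0],
                     [0.0, 1.0, 0.0],
                     [0.0, 0.0, 1.0]]], dtype=torch.float32)
image = cr.interpolate(col, findices, barycentric, tri)  # shape [1, 256, 256, 3]

This mirrors how MeshRender later in this diff wires the same two calls together inside raster_rasterize and raster_interpolate.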
diff --git a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/grid_neighbor.cpp b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/grid_neighbor.cpp new file mode 100644 index 0000000..65ab321 --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/grid_neighbor.cpp @@ -0,0 +1,574 @@ +#include "rasterizer.h" +#include + +inline int pos2key(float* p, int resolution) { + int x = (p[0] * 0.5 + 0.5) * resolution; + int y = (p[1] * 0.5 + 0.5) * resolution; + int z = (p[2] * 0.5 + 0.5) * resolution; + return (x * resolution + y) * resolution + z; +} + +inline void key2pos(int key, int resolution, float* p) { + int x = key / resolution / resolution; + int y = key / resolution % resolution; + int z = key % resolution; + p[0] = ((x + 0.5) / resolution - 0.5) * 2; + p[1] = ((y + 0.5) / resolution - 0.5) * 2; + p[2] = ((z + 0.5) / resolution - 0.5) * 2; +} + +inline void key2cornerpos(int key, int resolution, float* p) { + int x = key / resolution / resolution; + int y = key / resolution % resolution; + int z = key % resolution; + p[0] = ((x + 0.75) / resolution - 0.5) * 2; + p[1] = ((y + 0.25) / resolution - 0.5) * 2; + p[2] = ((z + 0.75) / resolution - 0.5) * 2; +} + +inline float* pos_ptr(int l, int i, int j, torch::Tensor t) { + float* pdata = t.data_ptr(); + int height = t.size(1); + int width = t.size(2); + return &pdata[((l * height + i) * width + j) * 4]; +} + +struct Grid +{ + std::vector seq2oddcorner; + std::vector seq2evencorner; + std::vector seq2grid; + std::vector seq2normal; + std::vector seq2neighbor; + std::unordered_map grid2seq; + std::vector downsample_seq; + int num_origin_seq; + int resolution; + int stride; +}; + +inline void pos_from_seq(Grid& grid, int seq, float* p) { + auto k = grid.seq2grid[seq]; + key2pos(k, grid.resolution, p); +} + +inline int fetch_seq(Grid& grid, int l, int i, int j, torch::Tensor pdata) { + float* p = pos_ptr(l, i, j, pdata); + if (p[3] == 0) + return -1; + auto key = pos2key(p, grid.resolution); + int seq = grid.grid2seq[key]; + return seq; +} + +inline int fetch_last_seq(Grid& grid, int i, int j, torch::Tensor pdata) { + int num_layers = pdata.size(0); + int l = 0; + int idx = fetch_seq(grid, l, i, j, pdata); + while (l < num_layers - 1) { + l += 1; + int new_idx = fetch_seq(grid, l, i, j, pdata); + if (new_idx == -1) + break; + idx = new_idx; + } + return idx; +} + +inline int fetch_nearest_seq(Grid& grid, int i, int j, int dim, float d, torch::Tensor pdata) { + float p[3]; + float max_dist = 1e10; + int best_idx = -1; + int num_layers = pdata.size(0); + for (int l = 0; l < num_layers; ++l) { + int idx = fetch_seq(grid, l, i, j, pdata); + if (idx == -1) + break; + pos_from_seq(grid, idx, p); + float dist = std::abs(d - p[(dim + 2) % 3]); + if (dist < max_dist) { + max_dist = dist; + best_idx = idx; + } + } + return best_idx; +} + +inline int fetch_nearest_seq_layer(Grid& grid, int i, int j, int dim, float d, torch::Tensor pdata) { + float p[3]; + float max_dist = 1e10; + int best_layer = -1; + int num_layers = pdata.size(0); + for (int l = 0; l < num_layers; ++l) { + int idx = fetch_seq(grid, l, i, j, pdata); + if (idx == -1) + break; + pos_from_seq(grid, idx, p); + float dist = std::abs(d - p[(dim + 2) % 3]); + if (dist < max_dist) { + max_dist = dist; + best_layer = l; + } + } + return best_layer; +} + +void FetchNeighbor(Grid& grid, int seq, float* pos, int dim, int boundary_info, std::vector& view_layer_positions, + int* output_indices) +{ + auto t = view_layer_positions[dim]; + int height = 
t.size(1); + int width = t.size(2); + int top = 0; + int ci = 0; + int cj = 0; + if (dim == 0) { + ci = (pos[1]/2+0.5)*height; + cj = (pos[0]/2+0.5)*width; + } + else if (dim == 1) { + ci = (pos[1]/2+0.5)*height; + cj = (pos[2]/2+0.5)*width; + } + else { + ci = (-pos[2]/2+0.5)*height; + cj = (pos[0]/2+0.5)*width; + } + int stride = grid.stride; + for (int ni = ci + stride; ni >= ci - stride; ni -= stride) { + for (int nj = cj - stride; nj <= cj + stride; nj += stride) { + int idx = -1; + if (ni == ci && nj == cj) + idx = seq; + else if (!(ni < 0 || ni >= height || nj < 0 || nj >= width)) { + if (boundary_info == -1) + idx = fetch_seq(grid, 0, ni, nj, t); + else if (boundary_info == 1) + idx = fetch_last_seq(grid, ni, nj, t); + else + idx = fetch_nearest_seq(grid, ni, nj, dim, pos[(dim + 2) % 3], t); + } + output_indices[top] = idx; + top += 1; + } + } +} + +void DownsampleGrid(Grid& src, Grid& tar) +{ + src.downsample_seq.resize(src.seq2grid.size(), -1); + tar.resolution = src.resolution / 2; + tar.stride = src.stride * 2; + float pos[3]; + std::vector seq2normal_count; + for (int i = 0; i < src.seq2grid.size(); ++i) { + key2pos(src.seq2grid[i], src.resolution, pos); + int k = pos2key(pos, tar.resolution); + int s = seq2normal_count.size(); + if (!tar.grid2seq.count(k)) { + tar.grid2seq[k] = tar.seq2grid.size(); + tar.seq2grid.emplace_back(k); + seq2normal_count.emplace_back(0); + seq2normal_count.emplace_back(0); + seq2normal_count.emplace_back(0); + //tar.seq2normal.emplace_back(src.seq2normal[i]); + } else { + s = tar.grid2seq[k] * 3; + } + seq2normal_count[s + src.seq2normal[i]] += 1; + src.downsample_seq[i] = tar.grid2seq[k]; + } + tar.seq2normal.resize(seq2normal_count.size() / 3); + for (int i = 0; i < seq2normal_count.size(); i += 3) { + int t = 0; + for (int j = 1; j < 3; ++j) { + if (seq2normal_count[i + j] > seq2normal_count[i + t]) + t = j; + } + tar.seq2normal[i / 3] = t; + } +} + +void NeighborGrid(Grid& grid, std::vector view_layer_positions, int v) +{ + grid.seq2evencorner.resize(grid.seq2grid.size(), 0); + grid.seq2oddcorner.resize(grid.seq2grid.size(), 0); + std::unordered_set visited_seq; + for (int vd = 0; vd < 3; ++vd) { + auto t = view_layer_positions[vd]; + auto t0 = view_layer_positions[v]; + int height = t.size(1); + int width = t.size(2); + int num_layers = t.size(0); + int num_view_layers = t0.size(0); + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + for (int l = 0; l < num_layers; ++l) { + int seq = fetch_seq(grid, l, i, j, t); + if (seq == -1) + break; + int dim = grid.seq2normal[seq]; + if (dim != v) + continue; + + float pos[3]; + pos_from_seq(grid, seq, pos); + + int ci = 0; + int cj = 0; + if (dim == 0) { + ci = (pos[1]/2+0.5)*height; + cj = (pos[0]/2+0.5)*width; + } + else if (dim == 1) { + ci = (pos[1]/2+0.5)*height; + cj = (pos[2]/2+0.5)*width; + } + else { + ci = (-pos[2]/2+0.5)*height; + cj = (pos[0]/2+0.5)*width; + } + + if ((ci % (grid.stride * 2) < grid.stride) && (cj % (grid.stride * 2) >= grid.stride)) + grid.seq2evencorner[seq] = 1; + + if ((ci % (grid.stride * 2) >= grid.stride) && (cj % (grid.stride * 2) < grid.stride)) + grid.seq2oddcorner[seq] = 1; + + bool is_boundary = false; + if (vd == v) { + if (l == 0 || l == num_layers - 1) + is_boundary = true; + else { + int seq_new = fetch_seq(grid, l + 1, i, j, t); + if (seq_new == -1) + is_boundary = true; + } + } + int boundary_info = 0; + if (is_boundary && (l == 0)) + boundary_info = -1; + else if (is_boundary) + boundary_info = 1; + if (visited_seq.count(seq)) + 
continue; + visited_seq.insert(seq); + + FetchNeighbor(grid, seq, pos, dim, boundary_info, view_layer_positions, &grid.seq2neighbor[seq * 9]); + } + } + } + } +} + +void PadGrid(Grid& src, Grid& tar, std::vector& view_layer_positions) { + auto& downsample_seq = src.downsample_seq; + auto& seq2evencorner = src.seq2evencorner; + auto& seq2oddcorner = src.seq2oddcorner; + int indices[9]; + std::vector mapped_even_corners(tar.seq2grid.size(), 0); + std::vector mapped_odd_corners(tar.seq2grid.size(), 0); + for (int i = 0; i < downsample_seq.size(); ++i) { + if (seq2evencorner[i] > 0) { + mapped_even_corners[downsample_seq[i]] = 1; + } + if (seq2oddcorner[i] > 0) { + mapped_odd_corners[downsample_seq[i]] = 1; + } + } + auto& tar_seq2normal = tar.seq2normal; + auto& tar_seq2grid = tar.seq2grid; + for (int i = 0; i < tar_seq2grid.size(); ++i) { + if (mapped_even_corners[i] == 1 && mapped_odd_corners[i] == 1) + continue; + auto k = tar_seq2grid[i]; + float p[3]; + key2cornerpos(k, tar.resolution, p); + + int src_key = pos2key(p, src.resolution); + if (!src.grid2seq.count(src_key)) { + int seq = src.seq2grid.size(); + src.grid2seq[src_key] = seq; + src.seq2evencorner.emplace_back((mapped_even_corners[i] == 0)); + src.seq2oddcorner.emplace_back((mapped_odd_corners[i] == 0)); + src.seq2grid.emplace_back(src_key); + src.seq2normal.emplace_back(tar_seq2normal[i]); + FetchNeighbor(src, seq, p, tar_seq2normal[i], 0, view_layer_positions, indices); + for (int j = 0; j < 9; ++j) { + src.seq2neighbor.emplace_back(indices[j]); + } + src.downsample_seq.emplace_back(i); + } else { + int seq = src.grid2seq[src_key]; + if (mapped_even_corners[i] == 0) + src.seq2evencorner[seq] = 1; + if (mapped_odd_corners[i] == 0) + src.seq2oddcorner[seq] = 1; + } + } +} + +std::vector> build_hierarchy(std::vector view_layer_positions, + std::vector view_layer_normals, int num_level, int resolution) +{ + if (view_layer_positions.size() != 3 || num_level < 1) { + printf("Alert! We require 3 layers and at least 1 level! 
(%d %d)\n", view_layer_positions.size(), num_level); + return {{},{},{},{}}; + } + + std::vector grids; + grids.resize(num_level); + + std::vector seq2pos; + auto& seq2grid = grids[0].seq2grid; + auto& seq2normal = grids[0].seq2normal; + auto& grid2seq = grids[0].grid2seq; + grids[0].resolution = resolution; + grids[0].stride = 1; + + auto int64_options = torch::TensorOptions().dtype(torch::kInt64).requires_grad(false); + auto float_options = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false); + + for (int v = 0; v < 3; ++v) { + int num_layers = view_layer_positions[v].size(0); + int height = view_layer_positions[v].size(1); + int width = view_layer_positions[v].size(2); + float* data = view_layer_positions[v].data_ptr(); + float* data_normal = view_layer_normals[v].data_ptr(); + for (int l = 0; l < num_layers; ++l) { + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + float* p = &data[(i * width + j) * 4]; + float* n = &data_normal[(i * width + j) * 3]; + if (p[3] == 0) + continue; + auto k = pos2key(p, resolution); + if (!grid2seq.count(k)) { + int dim = 0; + for (int d = 0; d < 3; ++d) { + if (std::abs(n[d]) > std::abs(n[dim])) + dim = d; + } + dim = (dim + 1) % 3; + grid2seq[k] = seq2grid.size(); + seq2grid.emplace_back(k); + seq2pos.push_back(p[0]); + seq2pos.push_back(p[1]); + seq2pos.push_back(p[2]); + seq2normal.emplace_back(dim); + } + } + } + data += (height * width * 4); + data_normal += (height * width * 3); + } + } + + for (int i = 0; i < num_level - 1; ++i) { + DownsampleGrid(grids[i], grids[i + 1]); + } + + for (int l = 0; l < num_level; ++l) { + grids[l].seq2neighbor.resize(grids[l].seq2grid.size() * 9, -1); + grids[l].num_origin_seq = grids[l].seq2grid.size(); + for (int d = 0; d < 3; ++d) { + NeighborGrid(grids[l], view_layer_positions, d); + } + } + + for (int i = num_level - 2; i >= 0; --i) { + PadGrid(grids[i], grids[i + 1], view_layer_positions); + } + for (int i = grids[0].num_origin_seq; i < grids[0].seq2grid.size(); ++i) { + int k = grids[0].seq2grid[i]; + float p[3]; + key2pos(k, grids[0].resolution, p); + seq2pos.push_back(p[0]); + seq2pos.push_back(p[1]); + seq2pos.push_back(p[2]); + } + + std::vector texture_positions(2); + std::vector grid_neighbors(grids.size()); + std::vector grid_downsamples(grids.size() - 1); + std::vector grid_evencorners(grids.size()); + std::vector grid_oddcorners(grids.size()); + + texture_positions[0] = torch::zeros({static_cast(seq2pos.size() / 3), static_cast(3)}, float_options); + texture_positions[1] = torch::zeros({static_cast(seq2pos.size() / 3)}, float_options); + float* positions_out_ptr = texture_positions[0].data_ptr(); + memcpy(positions_out_ptr, seq2pos.data(), sizeof(float) * seq2pos.size()); + positions_out_ptr = texture_positions[1].data_ptr(); + for (int i = 0; i < grids[0].seq2grid.size(); ++i) { + positions_out_ptr[i] = (i < grids[0].num_origin_seq); + } + + for (int i = 0; i < grids.size(); ++i) { + grid_neighbors[i] = torch::zeros({static_cast(grids[i].seq2grid.size()), static_cast(9)}, int64_options); + int64_t* nptr = grid_neighbors[i].data_ptr(); + for (int j = 0; j < grids[i].seq2neighbor.size(); ++j) { + nptr[j] = grids[i].seq2neighbor[j]; + } + + grid_evencorners[i] = torch::zeros({static_cast(grids[i].seq2evencorner.size())}, int64_options); + grid_oddcorners[i] = torch::zeros({static_cast(grids[i].seq2oddcorner.size())}, int64_options); + int64_t* dptr = grid_evencorners[i].data_ptr(); + for (int j = 0; j < grids[i].seq2evencorner.size(); ++j) { + dptr[j] = 
grids[i].seq2evencorner[j]; + } + dptr = grid_oddcorners[i].data_ptr(); + for (int j = 0; j < grids[i].seq2oddcorner.size(); ++j) { + dptr[j] = grids[i].seq2oddcorner[j]; + } + if (i + 1 < grids.size()) { + grid_downsamples[i] = torch::zeros({static_cast(grids[i].downsample_seq.size())}, int64_options); + int64_t* dptr = grid_downsamples[i].data_ptr(); + for (int j = 0; j < grids[i].downsample_seq.size(); ++j) { + dptr[j] = grids[i].downsample_seq[j]; + } + } + + } + return {texture_positions, grid_neighbors, grid_downsamples, grid_evencorners, grid_oddcorners}; +} + +std::vector> build_hierarchy_with_feat( + std::vector view_layer_positions, + std::vector view_layer_normals, + std::vector view_layer_feats, + int num_level, int resolution) +{ + if (view_layer_positions.size() != 3 || num_level < 1) { + printf("Alert! We require 3 layers and at least 1 level! (%d %d)\n", view_layer_positions.size(), num_level); + return {{},{},{},{}}; + } + + std::vector grids; + grids.resize(num_level); + + std::vector seq2pos; + std::vector seq2feat; + auto& seq2grid = grids[0].seq2grid; + auto& seq2normal = grids[0].seq2normal; + auto& grid2seq = grids[0].grid2seq; + grids[0].resolution = resolution; + grids[0].stride = 1; + + auto int64_options = torch::TensorOptions().dtype(torch::kInt64).requires_grad(false); + auto float_options = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false); + + int feat_channel = 3; + for (int v = 0; v < 3; ++v) { + int num_layers = view_layer_positions[v].size(0); + int height = view_layer_positions[v].size(1); + int width = view_layer_positions[v].size(2); + float* data = view_layer_positions[v].data_ptr(); + float* data_normal = view_layer_normals[v].data_ptr(); + float* data_feat = view_layer_feats[v].data_ptr(); + feat_channel = view_layer_feats[v].size(3); + for (int l = 0; l < num_layers; ++l) { + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + float* p = &data[(i * width + j) * 4]; + float* n = &data_normal[(i * width + j) * 3]; + float* f = &data_feat[(i * width + j) * feat_channel]; + if (p[3] == 0) + continue; + auto k = pos2key(p, resolution); + if (!grid2seq.count(k)) { + int dim = 0; + for (int d = 0; d < 3; ++d) { + if (std::abs(n[d]) > std::abs(n[dim])) + dim = d; + } + dim = (dim + 1) % 3; + grid2seq[k] = seq2grid.size(); + seq2grid.emplace_back(k); + seq2pos.push_back(p[0]); + seq2pos.push_back(p[1]); + seq2pos.push_back(p[2]); + for (int c = 0; c < feat_channel; ++c) { + seq2feat.emplace_back(f[c]); + } + seq2normal.emplace_back(dim); + } + } + } + data += (height * width * 4); + data_normal += (height * width * 3); + data_feat += (height * width * feat_channel); + } + } + + for (int i = 0; i < num_level - 1; ++i) { + DownsampleGrid(grids[i], grids[i + 1]); + } + + for (int l = 0; l < num_level; ++l) { + grids[l].seq2neighbor.resize(grids[l].seq2grid.size() * 9, -1); + grids[l].num_origin_seq = grids[l].seq2grid.size(); + for (int d = 0; d < 3; ++d) { + NeighborGrid(grids[l], view_layer_positions, d); + } + } + + for (int i = num_level - 2; i >= 0; --i) { + PadGrid(grids[i], grids[i + 1], view_layer_positions); + } + for (int i = grids[0].num_origin_seq; i < grids[0].seq2grid.size(); ++i) { + int k = grids[0].seq2grid[i]; + float p[3]; + key2pos(k, grids[0].resolution, p); + seq2pos.push_back(p[0]); + seq2pos.push_back(p[1]); + seq2pos.push_back(p[2]); + for (int c = 0; c < feat_channel; ++c) { + seq2feat.emplace_back(0.5); + } + } + + std::vector texture_positions(2); + std::vector texture_feats(1); + std::vector 
grid_neighbors(grids.size()); + std::vector grid_downsamples(grids.size() - 1); + std::vector grid_evencorners(grids.size()); + std::vector grid_oddcorners(grids.size()); + + texture_positions[0] = torch::zeros({static_cast(seq2pos.size() / 3), static_cast(3)}, float_options); + texture_positions[1] = torch::zeros({static_cast(seq2pos.size() / 3)}, float_options); + texture_feats[0] = torch::zeros({static_cast(seq2feat.size() / feat_channel), static_cast(feat_channel)}, float_options); + float* positions_out_ptr = texture_positions[0].data_ptr(); + memcpy(positions_out_ptr, seq2pos.data(), sizeof(float) * seq2pos.size()); + positions_out_ptr = texture_positions[1].data_ptr(); + for (int i = 0; i < grids[0].seq2grid.size(); ++i) { + positions_out_ptr[i] = (i < grids[0].num_origin_seq); + } + float* feats_out_ptr = texture_feats[0].data_ptr(); + memcpy(feats_out_ptr, seq2feat.data(), sizeof(float) * seq2feat.size()); + + for (int i = 0; i < grids.size(); ++i) { + grid_neighbors[i] = torch::zeros({static_cast(grids[i].seq2grid.size()), static_cast(9)}, int64_options); + int64_t* nptr = grid_neighbors[i].data_ptr(); + for (int j = 0; j < grids[i].seq2neighbor.size(); ++j) { + nptr[j] = grids[i].seq2neighbor[j]; + } + grid_evencorners[i] = torch::zeros({static_cast(grids[i].seq2evencorner.size())}, int64_options); + grid_oddcorners[i] = torch::zeros({static_cast(grids[i].seq2oddcorner.size())}, int64_options); + int64_t* dptr = grid_evencorners[i].data_ptr(); + for (int j = 0; j < grids[i].seq2evencorner.size(); ++j) { + dptr[j] = grids[i].seq2evencorner[j]; + } + dptr = grid_oddcorners[i].data_ptr(); + for (int j = 0; j < grids[i].seq2oddcorner.size(); ++j) { + dptr[j] = grids[i].seq2oddcorner[j]; + } + if (i + 1 < grids.size()) { + grid_downsamples[i] = torch::zeros({static_cast(grids[i].downsample_seq.size())}, int64_options); + int64_t* dptr = grid_downsamples[i].data_ptr(); + for (int j = 0; j < grids[i].downsample_seq.size(); ++j) { + dptr[j] = grids[i].downsample_seq[j]; + } + } + } + return {texture_positions, texture_feats, grid_neighbors, grid_downsamples, grid_evencorners, grid_oddcorners}; +} diff --git a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.cpp b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.cpp new file mode 100644 index 0000000..4af6eeb --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.cpp @@ -0,0 +1,139 @@ +#include "rasterizer.h" + +void rasterizeTriangleCPU(int idx, float* vt0, float* vt1, float* vt2, int width, int height, INT64* zbuffer, float* d, float occlusion_truncation) { + float x_min = std::min(vt0[0], std::min(vt1[0],vt2[0])); + float x_max = std::max(vt0[0], std::max(vt1[0],vt2[0])); + float y_min = std::min(vt0[1], std::min(vt1[1],vt2[1])); + float y_max = std::max(vt0[1], std::max(vt1[1],vt2[1])); + + for (int px = x_min; px < x_max + 1; ++px) { + if (px < 0 || px >= width) + continue; + for (int py = y_min; py < y_max + 1; ++py) { + if (py < 0 || py >= height) + continue; + float vt[2] = {px + 0.5, py + 0.5}; + float baryCentricCoordinate[3]; + calculateBarycentricCoordinate(vt0, vt1, vt2, vt, baryCentricCoordinate); + if (isBarycentricCoordInBounds(baryCentricCoordinate)) { + int pixel = py * width + px; + if (zbuffer == 0) { + zbuffer[pixel] = (INT64)(idx + 1); + continue; + } + + float depth = baryCentricCoordinate[0] * vt0[2] + baryCentricCoordinate[1] * vt1[2] + baryCentricCoordinate[2] * vt2[2]; + float depth_thres = 0; + if (d) { + depth_thres = 
d[pixel] * 0.49999f + 0.5f + occlusion_truncation; + } + + int z_quantize = depth * (2<<17); + INT64 token = (INT64)z_quantize * MAXINT + (INT64)(idx + 1); + if (depth < depth_thres) + continue; + zbuffer[pixel] = std::min(zbuffer[pixel], token); + } + } + } +} + +void barycentricFromImgcoordCPU(float* V, int* F, int* findices, INT64* zbuffer, int width, int height, int num_vertices, int num_faces, + float* barycentric_map, int pix) +{ + INT64 f = zbuffer[pix] % MAXINT; + if (f == (MAXINT-1)) { + findices[pix] = 0; + barycentric_map[pix * 3] = 0; + barycentric_map[pix * 3 + 1] = 0; + barycentric_map[pix * 3 + 2] = 0; + return; + } + findices[pix] = f; + f -= 1; + float barycentric[3] = {0, 0, 0}; + if (f >= 0) { + float vt[2] = {float(pix % width) + 0.5f, float(pix / width) + 0.5f}; + float* vt0_ptr = V + (F[f * 3] * 4); + float* vt1_ptr = V + (F[f * 3 + 1] * 4); + float* vt2_ptr = V + (F[f * 3 + 2] * 4); + + float vt0[2] = {(vt0_ptr[0] / vt0_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt0_ptr[1] / vt0_ptr[3]) * (height - 1) + 0.5f}; + float vt1[2] = {(vt1_ptr[0] / vt1_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt1_ptr[1] / vt1_ptr[3]) * (height - 1) + 0.5f}; + float vt2[2] = {(vt2_ptr[0] / vt2_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt2_ptr[1] / vt2_ptr[3]) * (height - 1) + 0.5f}; + + calculateBarycentricCoordinate(vt0, vt1, vt2, vt, barycentric); + + barycentric[0] = barycentric[0] / vt0_ptr[3]; + barycentric[1] = barycentric[1] / vt1_ptr[3]; + barycentric[2] = barycentric[2] / vt2_ptr[3]; + float w = 1.0f / (barycentric[0] + barycentric[1] + barycentric[2]); + barycentric[0] *= w; + barycentric[1] *= w; + barycentric[2] *= w; + + } + barycentric_map[pix * 3] = barycentric[0]; + barycentric_map[pix * 3 + 1] = barycentric[1]; + barycentric_map[pix * 3 + 2] = barycentric[2]; +} + +void rasterizeImagecoordsKernelCPU(float* V, int* F, float* d, INT64* zbuffer, float occlusion_trunc, int width, int height, int num_vertices, int num_faces, int f) +{ + float* vt0_ptr = V + (F[f * 3] * 4); + float* vt1_ptr = V + (F[f * 3 + 1] * 4); + float* vt2_ptr = V + (F[f * 3 + 2] * 4); + + float vt0[3] = {(vt0_ptr[0] / vt0_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt0_ptr[1] / vt0_ptr[3]) * (height - 1) + 0.5f, vt0_ptr[2] / vt0_ptr[3] * 0.49999f + 0.5f}; + float vt1[3] = {(vt1_ptr[0] / vt1_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt1_ptr[1] / vt1_ptr[3]) * (height - 1) + 0.5f, vt1_ptr[2] / vt1_ptr[3] * 0.49999f + 0.5f}; + float vt2[3] = {(vt2_ptr[0] / vt2_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt2_ptr[1] / vt2_ptr[3]) * (height - 1) + 0.5f, vt2_ptr[2] / vt2_ptr[3] * 0.49999f + 0.5f}; + + rasterizeTriangleCPU(f, vt0, vt1, vt2, width, height, zbuffer, d, occlusion_trunc); +} + +std::vector rasterize_image_cpu(torch::Tensor V, torch::Tensor F, torch::Tensor D, + int width, int height, float occlusion_truncation, int use_depth_prior) +{ + int num_faces = F.size(0); + int num_vertices = V.size(0); + auto options = torch::TensorOptions().dtype(torch::kInt32).requires_grad(false); + auto INT64_options = torch::TensorOptions().dtype(torch::kInt64).requires_grad(false); + auto findices = torch::zeros({height, width}, options); + INT64 maxint = (INT64)MAXINT * (INT64)MAXINT + (MAXINT - 1); + auto z_min = torch::ones({height, width}, INT64_options) * (int64_t)maxint; + + if (!use_depth_prior) { + for (int i = 0; i < num_faces; ++i) { + rasterizeImagecoordsKernelCPU(V.data_ptr(), F.data_ptr(), 0, + 
(INT64*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces, i); + } + } else { + for (int i = 0; i < num_faces; ++i) + rasterizeImagecoordsKernelCPU(V.data_ptr(), F.data_ptr(), D.data_ptr(), + (INT64*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces, i); + } + + auto float_options = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false); + auto barycentric = torch::zeros({height, width, 3}, float_options); + for (int i = 0; i < width * height; ++i) + barycentricFromImgcoordCPU(V.data_ptr(), F.data_ptr(), + findices.data_ptr(), (INT64*)z_min.data_ptr(), width, height, num_vertices, num_faces, barycentric.data_ptr(), i); + + return {findices, barycentric}; +} + +std::vector rasterize_image(torch::Tensor V, torch::Tensor F, torch::Tensor D, + int width, int height, float occlusion_truncation, int use_depth_prior) +{ + int device_id = V.get_device(); + if (device_id == -1) + return rasterize_image_cpu(V, F, D, width, height, occlusion_truncation, use_depth_prior); + else + return rasterize_image_gpu(V, F, D, width, height, occlusion_truncation, use_depth_prior); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("rasterize_image", &rasterize_image, "Custom image rasterization"); + m.def("build_hierarchy", &build_hierarchy, "Custom image rasterization"); + m.def("build_hierarchy_with_feat", &build_hierarchy_with_feat, "Custom image rasterization"); +} diff --git a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.h b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.h new file mode 100644 index 0000000..cf4f987 --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.h @@ -0,0 +1,54 @@ +#ifndef RASTERIZER_H_ +#define RASTERIZER_H_ + +#include +#include +#include +#include // For CUDA context + +#define INT64 unsigned long long +#define MAXINT 2147483647 + +__host__ __device__ inline float calculateSignedArea2(float* a, float* b, float* c) { + return ((c[0] - a[0]) * (b[1] - a[1]) - (b[0] - a[0]) * (c[1] - a[1])); +} + +__host__ __device__ inline void calculateBarycentricCoordinate(float* a, float* b, float* c, float* p, + float* barycentric) +{ + float beta_tri = calculateSignedArea2(a, p, c); + float gamma_tri = calculateSignedArea2(a, b, p); + float area = calculateSignedArea2(a, b, c); + if (area == 0) { + barycentric[0] = -1.0; + barycentric[1] = -1.0; + barycentric[2] = -1.0; + return; + } + float tri_inv = 1.0 / area; + float beta = beta_tri * tri_inv; + float gamma = gamma_tri * tri_inv; + float alpha = 1.0 - beta - gamma; + barycentric[0] = alpha; + barycentric[1] = beta; + barycentric[2] = gamma; +} + +__host__ __device__ inline bool isBarycentricCoordInBounds(float* barycentricCoord) { + return barycentricCoord[0] >= 0.0 && barycentricCoord[0] <= 1.0 && + barycentricCoord[1] >= 0.0 && barycentricCoord[1] <= 1.0 && + barycentricCoord[2] >= 0.0 && barycentricCoord[2] <= 1.0; +} + +std::vector rasterize_image_gpu(torch::Tensor V, torch::Tensor F, torch::Tensor D, + int width, int height, float occlusion_truncation, int use_depth_prior); + +std::vector> build_hierarchy(std::vector view_layer_positions, std::vector view_layer_normals, int num_level, int resolution); + +std::vector> build_hierarchy_with_feat( + std::vector view_layer_positions, + std::vector view_layer_normals, + std::vector view_layer_feats, + int num_level, int resolution); + +#endif \ No newline at end of file diff --git 
a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer_gpu.cu b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer_gpu.cu new file mode 100644 index 0000000..cc6f354 --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer_gpu.cu @@ -0,0 +1,127 @@ +#include "rasterizer.h" + +__device__ void rasterizeTriangleGPU(int idx, float* vt0, float* vt1, float* vt2, int width, int height, INT64* zbuffer, float* d, float occlusion_truncation) { + float x_min = std::min(vt0[0], std::min(vt1[0],vt2[0])); + float x_max = std::max(vt0[0], std::max(vt1[0],vt2[0])); + float y_min = std::min(vt0[1], std::min(vt1[1],vt2[1])); + float y_max = std::max(vt0[1], std::max(vt1[1],vt2[1])); + + for (int px = x_min; px < x_max + 1; ++px) { + if (px < 0 || px >= width) + continue; + for (int py = y_min; py < y_max + 1; ++py) { + if (py < 0 || py >= height) + continue; + float vt[2] = {px + 0.5f, py + 0.5f}; + float baryCentricCoordinate[3]; + calculateBarycentricCoordinate(vt0, vt1, vt2, vt, baryCentricCoordinate); + if (isBarycentricCoordInBounds(baryCentricCoordinate)) { + int pixel = py * width + px; + if (zbuffer == 0) { + atomicExch(&zbuffer[pixel], (INT64)(idx + 1)); + continue; + } + float depth = baryCentricCoordinate[0] * vt0[2] + baryCentricCoordinate[1] * vt1[2] + baryCentricCoordinate[2] * vt2[2]; + float depth_thres = 0; + if (d) { + depth_thres = d[pixel] * 0.49999f + 0.5f + occlusion_truncation; + } + + int z_quantize = depth * (2<<17); + INT64 token = (INT64)z_quantize * MAXINT + (INT64)(idx + 1); + if (depth < depth_thres) + continue; + atomicMin(&zbuffer[pixel], token); + } + } + } +} + +__global__ void barycentricFromImgcoordGPU(float* V, int* F, int* findices, INT64* zbuffer, int width, int height, int num_vertices, int num_faces, + float* barycentric_map) +{ + int pix = blockIdx.x * blockDim.x + threadIdx.x; + if (pix >= width * height) + return; + INT64 f = zbuffer[pix] % MAXINT; + if (f == (MAXINT-1)) { + findices[pix] = 0; + barycentric_map[pix * 3] = 0; + barycentric_map[pix * 3 + 1] = 0; + barycentric_map[pix * 3 + 2] = 0; + return; + } + findices[pix] = f; + f -= 1; + float barycentric[3] = {0, 0, 0}; + if (f >= 0) { + float vt[2] = {float(pix % width) + 0.5f, float(pix / width) + 0.5f}; + float* vt0_ptr = V + (F[f * 3] * 4); + float* vt1_ptr = V + (F[f * 3 + 1] * 4); + float* vt2_ptr = V + (F[f * 3 + 2] * 4); + + float vt0[2] = {(vt0_ptr[0] / vt0_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt0_ptr[1] / vt0_ptr[3]) * (height - 1) + 0.5f}; + float vt1[2] = {(vt1_ptr[0] / vt1_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt1_ptr[1] / vt1_ptr[3]) * (height - 1) + 0.5f}; + float vt2[2] = {(vt2_ptr[0] / vt2_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt2_ptr[1] / vt2_ptr[3]) * (height - 1) + 0.5f}; + + calculateBarycentricCoordinate(vt0, vt1, vt2, vt, barycentric); + + barycentric[0] = barycentric[0] / vt0_ptr[3]; + barycentric[1] = barycentric[1] / vt1_ptr[3]; + barycentric[2] = barycentric[2] / vt2_ptr[3]; + float w = 1.0f / (barycentric[0] + barycentric[1] + barycentric[2]); + barycentric[0] *= w; + barycentric[1] *= w; + barycentric[2] *= w; + + } + barycentric_map[pix * 3] = barycentric[0]; + barycentric_map[pix * 3 + 1] = barycentric[1]; + barycentric_map[pix * 3 + 2] = barycentric[2]; +} + +__global__ void rasterizeImagecoordsKernelGPU(float* V, int* F, float* d, INT64* zbuffer, float occlusion_trunc, int width, int height, int num_vertices, int num_faces) +{ + int f = 
blockIdx.x * blockDim.x + threadIdx.x; + if (f >= num_faces) + return; + + float* vt0_ptr = V + (F[f * 3] * 4); + float* vt1_ptr = V + (F[f * 3 + 1] * 4); + float* vt2_ptr = V + (F[f * 3 + 2] * 4); + + float vt0[3] = {(vt0_ptr[0] / vt0_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt0_ptr[1] / vt0_ptr[3]) * (height - 1) + 0.5f, vt0_ptr[2] / vt0_ptr[3] * 0.49999f + 0.5f}; + float vt1[3] = {(vt1_ptr[0] / vt1_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt1_ptr[1] / vt1_ptr[3]) * (height - 1) + 0.5f, vt1_ptr[2] / vt1_ptr[3] * 0.49999f + 0.5f}; + float vt2[3] = {(vt2_ptr[0] / vt2_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt2_ptr[1] / vt2_ptr[3]) * (height - 1) + 0.5f, vt2_ptr[2] / vt2_ptr[3] * 0.49999f + 0.5f}; + + rasterizeTriangleGPU(f, vt0, vt1, vt2, width, height, zbuffer, d, occlusion_trunc); +} + +std::vector rasterize_image_gpu(torch::Tensor V, torch::Tensor F, torch::Tensor D, + int width, int height, float occlusion_truncation, int use_depth_prior) +{ + int device_id = V.get_device(); + cudaSetDevice(device_id); + int num_faces = F.size(0); + int num_vertices = V.size(0); + auto options = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA, device_id).requires_grad(false); + auto INT64_options = torch::TensorOptions().dtype(torch::kInt64).device(torch::kCUDA, device_id).requires_grad(false); + auto findices = torch::zeros({height, width}, options); + INT64 maxint = (INT64)MAXINT * (INT64)MAXINT + (MAXINT - 1); + auto z_min = torch::ones({height, width}, INT64_options) * (int64_t)maxint; + + if (!use_depth_prior) { + rasterizeImagecoordsKernelGPU<<<(num_faces+255)/256,256,0,at::cuda::getCurrentCUDAStream()>>>(V.data_ptr(), F.data_ptr(), 0, + (INT64*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces); + } else { + rasterizeImagecoordsKernelGPU<<<(num_faces+255)/256,256,0,at::cuda::getCurrentCUDAStream()>>>(V.data_ptr(), F.data_ptr(), D.data_ptr(), + (INT64*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces); + } + + auto float_options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA, device_id).requires_grad(false); + auto barycentric = torch::zeros({height, width, 3}, float_options); + barycentricFromImgcoordGPU<<<(width * height + 255)/256, 256>>>(V.data_ptr(), F.data_ptr(), + findices.data_ptr(), (INT64*)z_min.data_ptr(), width, height, num_vertices, num_faces, barycentric.data_ptr()); + + return {findices, barycentric}; +} diff --git a/hy3dgen/texgen/custom_rasterizer/setup.py b/hy3dgen/texgen/custom_rasterizer/setup.py new file mode 100644 index 0000000..3e312a7 --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/setup.py @@ -0,0 +1,26 @@ +from setuptools import setup, find_packages +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +# build custom rasterizer +# build with `python setup.py install` +# nvcc is needed + +custom_rasterizer_module = CUDAExtension('custom_rasterizer_kernel', [ + 'lib/custom_rasterizer_kernel/rasterizer.cpp', + 'lib/custom_rasterizer_kernel/grid_neighbor.cpp', + 'lib/custom_rasterizer_kernel/rasterizer_gpu.cu', +]) + +setup( + packages=find_packages(), + version='0.1', + name='custom_rasterizer', + include_package_data=True, + package_dir={'': '.'}, + ext_modules=[ + custom_rasterizer_module, + ], + cmdclass={ + 'build_ext': BuildExtension + } +) diff --git a/hy3dgen/texgen/differentiable_renderer/__init__.py b/hy3dgen/texgen/differentiable_renderer/__init__.py new file mode 100644 index 
0000000..8bb2bf8 --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/__init__.py @@ -0,0 +1,13 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. \ No newline at end of file diff --git a/hy3dgen/texgen/differentiable_renderer/camera_utils.py b/hy3dgen/texgen/differentiable_renderer/camera_utils.py new file mode 100644 index 0000000..b67727c --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/camera_utils.py @@ -0,0 +1,106 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import math + +import numpy as np +import torch + + +def transform_pos(mtx, pos, keepdim=False): + t_mtx = torch.from_numpy(mtx).to( + pos.device) if isinstance( + mtx, np.ndarray) else mtx + if pos.shape[-1] == 3: + posw = torch.cat( + [pos, torch.ones([pos.shape[0], 1]).to(pos.device)], axis=1) + else: + posw = pos + + if keepdim: + return torch.matmul(posw, t_mtx.t())[...] + else: + return torch.matmul(posw, t_mtx.t())[None, ...] 
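transform_pos above homogenizes an [N, 3] position tensor by appending w = 1, applies a 4x4 matrix supplied either as a NumPy array or a torch tensor, and returns a batched [1, N, 4] result unless keepdim=True. A small identity-matrix check, assuming the repository root is on PYTHONPATH:

import numpy as np
import torch
from hy3dgen.texgen.differentiable_renderer.camera_utils import transform_pos

pos = torch.tensor([[0.0, 0.0, 0.0],
                    [1.0, 2.0, 3.0]])
mvp = np.eye(4, dtype=np.float32)     # stand-in for a real model-view-projection

clip = transform_pos(mvp, pos)
print(clip.shape)   # torch.Size([1, 2, 4])
print(clip[0, 1])   # tensor([1., 2., 3., 1.]) - w = 1 appended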
+ + +def get_mv_matrix(elev, azim, camera_distance, center=None): + elev = -elev + azim += 90 + + elev_rad = math.radians(elev) + azim_rad = math.radians(azim) + + camera_position = np.array([camera_distance * math.cos(elev_rad) * math.cos(azim_rad), + camera_distance * + math.cos(elev_rad) * math.sin(azim_rad), + camera_distance * math.sin(elev_rad)]) + + if center is None: + center = np.array([0, 0, 0]) + else: + center = np.array(center) + + lookat = center - camera_position + lookat = lookat / np.linalg.norm(lookat) + + up = np.array([0, 0, 1.0]) + right = np.cross(lookat, up) + right = right / np.linalg.norm(right) + up = np.cross(right, lookat) + up = up / np.linalg.norm(up) + + c2w = np.concatenate( + [np.stack([right, up, -lookat], axis=-1), camera_position[:, None]], axis=-1) + + w2c = np.zeros((4, 4)) + w2c[:3, :3] = np.transpose(c2w[:3, :3], (1, 0)) + w2c[:3, 3:] = -np.matmul(np.transpose(c2w[:3, :3], (1, 0)), c2w[:3, 3:]) + w2c[3, 3] = 1.0 + + return w2c.astype(np.float32) + + +def get_orthographic_projection_matrix( + left=-1, right=1, bottom=-1, top=1, near=0, far=2): + """ + 计算正交投影矩阵。 + + 参数: + left (float): 投影区域左侧边界。 + right (float): 投影区域右侧边界。 + bottom (float): 投影区域底部边界。 + top (float): 投影区域顶部边界。 + near (float): 投影区域近裁剪面距离。 + far (float): 投影区域远裁剪面距离。 + + 返回: + numpy.ndarray: 正交投影矩阵。 + """ + ortho_matrix = np.eye(4, dtype=np.float32) + ortho_matrix[0, 0] = 2 / (right - left) + ortho_matrix[1, 1] = 2 / (top - bottom) + ortho_matrix[2, 2] = -2 / (far - near) + ortho_matrix[0, 3] = -(right + left) / (right - left) + ortho_matrix[1, 3] = -(top + bottom) / (top - bottom) + ortho_matrix[2, 3] = -(far + near) / (far - near) + return ortho_matrix + + +def get_perspective_projection_matrix(fovy, aspect_wh, near, far): + fovy_rad = math.radians(fovy) + return np.array([[1.0 / (math.tan(fovy_rad / 2.0) * aspect_wh), 0, 0, 0], + [0, 1.0 / math.tan(fovy_rad / 2.0), 0, 0], + [0, 0, -(far + near) / (far - near), - + 2.0 * far * near / (far - near)], + [0, 0, -1, 0]]).astype(np.float32) diff --git a/hy3dgen/texgen/differentiable_renderer/compile_mesh_painter.bat b/hy3dgen/texgen/differentiable_renderer/compile_mesh_painter.bat new file mode 100644 index 0000000..3947b0f --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/compile_mesh_painter.bat @@ -0,0 +1,3 @@ +FOR /F "tokens=*" %%i IN ('python -m pybind11 --includes') DO SET PYINCLUDES=%%i +echo %PYINCLUDES% +g++ -O3 -Wall -shared -std=c++11 -fPIC %PYINCLUDES% mesh_processor.cpp -o mesh_processor.pyd -lpython3.12 \ No newline at end of file diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_processor.cpp b/hy3dgen/texgen/differentiable_renderer/mesh_processor.cpp new file mode 100644 index 0000000..ca8650f --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/mesh_processor.cpp @@ -0,0 +1,161 @@ +#include +#include +#include +#include +#include +#include +#include + +namespace py = pybind11; +using namespace std; + +std::pair, + py::array_t> meshVerticeInpaint_smooth(py::array_t texture, +py::array_t mask, + py::array_t vtx_pos, py::array_t vtx_uv, + py::array_t pos_idx, py::array_t uv_idx) { + auto texture_buf = texture.request(); + auto mask_buf = mask.request(); + auto vtx_pos_buf = vtx_pos.request(); + auto vtx_uv_buf = vtx_uv.request(); + auto pos_idx_buf = pos_idx.request(); + auto uv_idx_buf = uv_idx.request(); + + int texture_height = texture_buf.shape[0]; + int texture_width = texture_buf.shape[1]; + int texture_channel = texture_buf.shape[2]; + float* texture_ptr = static_cast(texture_buf.ptr); + uint8_t* 
mask_ptr = static_cast(mask_buf.ptr); + + int vtx_num = vtx_pos_buf.shape[0]; + float* vtx_pos_ptr = static_cast(vtx_pos_buf.ptr); + float* vtx_uv_ptr = static_cast(vtx_uv_buf.ptr); + int* pos_idx_ptr = static_cast(pos_idx_buf.ptr); + int* uv_idx_ptr = static_cast(uv_idx_buf.ptr); + + vector vtx_mask(vtx_num, 0.0f); + vector> vtx_color(vtx_num, vector(texture_channel, 0.0f)); + vector uncolored_vtxs; + + vector> G(vtx_num); + + for (int i = 0; i < uv_idx_buf.shape[0]; ++i) { + for (int k = 0; k < 3; ++k) { + int vtx_uv_idx = uv_idx_ptr[i * 3 + k]; + int vtx_idx = pos_idx_ptr[i * 3 + k]; + int uv_v = round(vtx_uv_ptr[vtx_uv_idx * 2] * (texture_width - 1)); + int uv_u = round((1.0 - vtx_uv_ptr[vtx_uv_idx * 2 + 1]) * (texture_height - 1)); + + if (mask_ptr[uv_u * texture_width + uv_v] > 0) { + vtx_mask[vtx_idx] = 1.0f; + for (int c = 0; c < texture_channel; ++c) { + vtx_color[vtx_idx][c] = texture_ptr[(uv_u * texture_width + uv_v) * texture_channel + c]; + } + }else{ + uncolored_vtxs.push_back(vtx_idx); + } + + G[pos_idx_ptr[i * 3 + k]].push_back(pos_idx_ptr[i * 3 + (k + 1) % 3]); + } + } + + int smooth_count = 2; + int last_uncolored_vtx_count = 0; + while (smooth_count>0) { + int uncolored_vtx_count = 0; + + for (int vtx_idx : uncolored_vtxs) { + + vector sum_color(texture_channel, 0.0f); + float total_weight = 0.0f; + + array vtx_0 = {vtx_pos_ptr[vtx_idx * 3], +vtx_pos_ptr[vtx_idx * 3 + 1], vtx_pos_ptr[vtx_idx * 3 + 2]}; + for (int connected_idx : G[vtx_idx]) { + if (vtx_mask[connected_idx] > 0) { + array vtx1 = {vtx_pos_ptr[connected_idx * 3], + vtx_pos_ptr[connected_idx * 3 + 1], vtx_pos_ptr[connected_idx * 3 + 2]}; + float dist_weight = 1.0f / max(sqrt(pow(vtx_0[0] - vtx1[0], 2) + pow(vtx_0[1] - vtx1[1], 2) + \ + pow(vtx_0[2] - vtx1[2], 2)), 1E-4); + dist_weight = dist_weight * dist_weight; + for (int c = 0; c < texture_channel; ++c) { + sum_color[c] += vtx_color[connected_idx][c] * dist_weight; + } + total_weight += dist_weight; + } + } + + if (total_weight > 0.0f) { + for (int c = 0; c < texture_channel; ++c) { + vtx_color[vtx_idx][c] = sum_color[c] / total_weight; + } + vtx_mask[vtx_idx] = 1.0f; + } else { + uncolored_vtx_count++; + } + + } + + if(last_uncolored_vtx_count==uncolored_vtx_count){ + smooth_count--; + }else{ + smooth_count++; + } + last_uncolored_vtx_count = uncolored_vtx_count; + } + + // Create new arrays for the output + py::array_t new_texture(texture_buf.size); + py::array_t new_mask(mask_buf.size); + + auto new_texture_buf = new_texture.request(); + auto new_mask_buf = new_mask.request(); + + float* new_texture_ptr = static_cast(new_texture_buf.ptr); + uint8_t* new_mask_ptr = static_cast(new_mask_buf.ptr); + // Copy original texture and mask to new arrays + std::copy(texture_ptr, texture_ptr + texture_buf.size, new_texture_ptr); + std::copy(mask_ptr, mask_ptr + mask_buf.size, new_mask_ptr); + + for (int face_idx = 0; face_idx < uv_idx_buf.shape[0]; ++face_idx) { + for (int k = 0; k < 3; ++k) { + int vtx_uv_idx = uv_idx_ptr[face_idx * 3 + k]; + int vtx_idx = pos_idx_ptr[face_idx * 3 + k]; + + if (vtx_mask[vtx_idx] == 1.0f) { + int uv_v = round(vtx_uv_ptr[vtx_uv_idx * 2] * (texture_width - 1)); + int uv_u = round((1.0 - vtx_uv_ptr[vtx_uv_idx * 2 + 1]) * (texture_height - 1)); + + for (int c = 0; c < texture_channel; ++c) { + new_texture_ptr[(uv_u * texture_width + uv_v) * texture_channel + c] = vtx_color[vtx_idx][c]; + } + new_mask_ptr[uv_u * texture_width + uv_v] = 255; + } + } + } + + // Reshape the new arrays to match the original texture and mask shapes + 
new_texture.resize({texture_height, texture_width, 3}); + new_mask.resize({texture_height, texture_width}); + return std::make_pair(new_texture, new_mask); +} + + +std::pair, py::array_t> meshVerticeInpaint(py::array_t texture, + py::array_t mask, + py::array_t vtx_pos, py::array_t vtx_uv, + py::array_t pos_idx, py::array_t uv_idx, const std::string& method = "smooth") { + if (method == "smooth") { + return meshVerticeInpaint_smooth(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx); + } else { + throw std::invalid_argument("Invalid method. Use 'smooth' or 'forward'."); + } +} + +PYBIND11_MODULE(mesh_processor, m) { + m.def("meshVerticeInpaint", &meshVerticeInpaint, "A function to process mesh", + py::arg("texture"), py::arg("mask"), + py::arg("vtx_pos"), py::arg("vtx_uv"), + py::arg("pos_idx"), py::arg("uv_idx"), + py::arg("method") = "smooth"); +} \ No newline at end of file diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_processor.py b/hy3dgen/texgen/differentiable_renderer/mesh_processor.py new file mode 100644 index 0000000..5a731cc --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/mesh_processor.py @@ -0,0 +1,84 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
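The compiled mesh_processor module above and the pure-Python fallback that follows expose the same meshVerticeInpaint signature. Below is a tiny end-to-end check written against the Python fallback (assuming the repository root is importable); the texture, mask, and triangle are invented test data. The triangle's third vertex maps to an unmasked texel, so it inherits a distance-weighted color from its colored graph neighbor, and that texel is written back into the returned texture and mask.

import numpy as np
from hy3dgen.texgen.differentiable_renderer.mesh_processor import meshVerticeInpaint

texture = np.zeros((4, 4, 3), dtype=np.float32)
mask = np.zeros((4, 4), dtype=np.uint8)
texture[0, 0] = [1.0, 0.0, 0.0]    # texel of vertex 0 (already colored red)
texture[0, 3] = [0.0, 0.0, 1.0]    # texel of vertex 1 (already colored blue)
mask[0, 0] = mask[0, 3] = 255      # vertex 2's texel at (3, 0) stays unmasked

vtx_pos = np.array([[0, 0, 0], [1, 0, 0], [0, 1, 0]], dtype=np.float32)
vtx_uv = np.array([[0, 1], [1, 1], [0, 0]], dtype=np.float32)
pos_idx = np.array([[0, 1, 2]], dtype=np.int32)
uv_idx = np.array([[0, 1, 2]], dtype=np.int32)

new_texture, new_mask = meshVerticeInpaint(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx)
print(new_texture[3, 0], new_mask[3, 0])  # [1. 0. 0.] 255 - filled from vertex 0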
+ +import numpy as np + +def meshVerticeInpaint_smooth(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx): + texture_height, texture_width, texture_channel = texture.shape + vtx_num = vtx_pos.shape[0] + + vtx_mask = np.zeros(vtx_num, dtype=np.float32) + vtx_color = [np.zeros(texture_channel, dtype=np.float32) for _ in range(vtx_num)] + uncolored_vtxs = [] + G = [[] for _ in range(vtx_num)] + + for i in range(uv_idx.shape[0]): + for k in range(3): + vtx_uv_idx = uv_idx[i, k] + vtx_idx = pos_idx[i, k] + uv_v = int(round(vtx_uv[vtx_uv_idx, 0] * (texture_width - 1))) + uv_u = int(round((1.0 - vtx_uv[vtx_uv_idx, 1]) * (texture_height - 1))) + if mask[uv_u, uv_v] > 0: + vtx_mask[vtx_idx] = 1.0 + vtx_color[vtx_idx] = texture[uv_u, uv_v] + else: + uncolored_vtxs.append(vtx_idx) + G[pos_idx[i, k]].append(pos_idx[i, (k + 1) % 3]) + + smooth_count = 2 + last_uncolored_vtx_count = 0 + while smooth_count > 0: + uncolored_vtx_count = 0 + for vtx_idx in uncolored_vtxs: + sum_color = np.zeros(texture_channel, dtype=np.float32) + total_weight = 0.0 + vtx_0 = vtx_pos[vtx_idx] + for connected_idx in G[vtx_idx]: + if vtx_mask[connected_idx] > 0: + vtx1 = vtx_pos[connected_idx] + dist = np.sqrt(np.sum((vtx_0 - vtx1) ** 2)) + dist_weight = 1.0 / max(dist, 1e-4) + dist_weight *= dist_weight + sum_color += vtx_color[connected_idx] * dist_weight + total_weight += dist_weight + if total_weight > 0: + vtx_color[vtx_idx] = sum_color / total_weight + vtx_mask[vtx_idx] = 1.0 + else: + uncolored_vtx_count += 1 + + if last_uncolored_vtx_count == uncolored_vtx_count: + smooth_count -= 1 + else: + smooth_count += 1 + last_uncolored_vtx_count = uncolored_vtx_count + + new_texture = texture.copy() + new_mask = mask.copy() + for face_idx in range(uv_idx.shape[0]): + for k in range(3): + vtx_uv_idx = uv_idx[face_idx, k] + vtx_idx = pos_idx[face_idx, k] + if vtx_mask[vtx_idx] == 1.0: + uv_v = int(round(vtx_uv[vtx_uv_idx, 0] * (texture_width - 1))) + uv_u = int(round((1.0 - vtx_uv[vtx_uv_idx, 1]) * (texture_height - 1))) + new_texture[uv_u, uv_v] = vtx_color[vtx_idx] + new_mask[uv_u, uv_v] = 255 + return new_texture, new_mask + +def meshVerticeInpaint(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx, method="smooth"): + if method == "smooth": + return meshVerticeInpaint_smooth(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx) + else: + raise ValueError("Invalid method. Use 'smooth' or 'forward'.") \ No newline at end of file diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_render.py b/hy3dgen/texgen/differentiable_renderer/mesh_render.py new file mode 100644 index 0000000..6f83a36 --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/mesh_render.py @@ -0,0 +1,823 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. 
+ +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import cv2 +import numpy as np +import torch +import torch.nn.functional as F +import trimesh +from PIL import Image + +from .camera_utils import ( + transform_pos, + get_mv_matrix, + get_orthographic_projection_matrix, + get_perspective_projection_matrix, +) +from .mesh_processor import meshVerticeInpaint +from .mesh_utils import load_mesh, save_mesh + + +def stride_from_shape(shape): + stride = [1] + for x in reversed(shape[1:]): + stride.append(stride[-1] * x) + return list(reversed(stride)) + + +def scatter_add_nd_with_count(input, count, indices, values, weights=None): + # input: [..., C], D dimension + C channel + # count: [..., 1], D dimension + # indices: [N, D], long + # values: [N, C] + + D = indices.shape[-1] + C = input.shape[-1] + size = input.shape[:-1] + stride = stride_from_shape(size) + + assert len(size) == D + + input = input.view(-1, C) # [HW, C] + count = count.view(-1, 1) + + flatten_indices = (indices * torch.tensor(stride, + dtype=torch.long, device=indices.device)).sum(-1) # [N] + + if weights is None: + weights = torch.ones_like(values[..., :1]) + + input.scatter_add_(0, flatten_indices.unsqueeze(1).repeat(1, C), values) + count.scatter_add_(0, flatten_indices.unsqueeze(1), weights) + + return input.view(*size, C), count.view(*size, 1) + + +def linear_grid_put_2d(H, W, coords, values, return_count=False): + # coords: [N, 2], float in [0, 1] + # values: [N, C] + + C = values.shape[-1] + + indices = coords * torch.tensor( + [H - 1, W - 1], dtype=torch.float32, device=coords.device + ) + indices_00 = indices.floor().long() # [N, 2] + indices_00[:, 0].clamp_(0, H - 2) + indices_00[:, 1].clamp_(0, W - 2) + indices_01 = indices_00 + torch.tensor( + [0, 1], dtype=torch.long, device=indices.device + ) + indices_10 = indices_00 + torch.tensor( + [1, 0], dtype=torch.long, device=indices.device + ) + indices_11 = indices_00 + torch.tensor( + [1, 1], dtype=torch.long, device=indices.device + ) + + h = indices[..., 0] - indices_00[..., 0].float() + w = indices[..., 1] - indices_00[..., 1].float() + w_00 = (1 - h) * (1 - w) + w_01 = (1 - h) * w + w_10 = h * (1 - w) + w_11 = h * w + + result = torch.zeros(H, W, C, device=values.device, + dtype=values.dtype) # [H, W, C] + count = torch.zeros(H, W, 1, device=values.device, + dtype=values.dtype) # [H, W, 1] + weights = torch.ones_like(values[..., :1]) # [N, 1] + + result, count = scatter_add_nd_with_count( + result, count, indices_00, values * w_00.unsqueeze(1), weights * w_00.unsqueeze(1)) + result, count = scatter_add_nd_with_count( + result, count, indices_01, values * w_01.unsqueeze(1), weights * w_01.unsqueeze(1)) + result, count = scatter_add_nd_with_count( + result, count, indices_10, values * w_10.unsqueeze(1), weights * w_10.unsqueeze(1)) + result, count = scatter_add_nd_with_count( + result, count, indices_11, values * w_11.unsqueeze(1), weights * w_11.unsqueeze(1)) + + if return_count: + return result, count + + mask = (count.squeeze(-1) > 0) + result[mask] = result[mask] / count[mask].repeat(1, C) + + return result + + +class MeshRender(): + def __init__( + self, + camera_distance=1.45, 
camera_type='orth', + default_resolution=1024, texture_size=1024, + use_antialias=True, max_mip_level=None, filter_mode='linear', + bake_mode='linear', raster_mode='cr', device='cuda'): + + self.device = device + + self.set_default_render_resolution(default_resolution) + self.set_default_texture_resolution(texture_size) + + self.camera_distance = camera_distance + self.use_antialias = use_antialias + self.max_mip_level = max_mip_level + self.filter_mode = filter_mode + + self.bake_angle_thres = 75 + self.bake_unreliable_kernel_size = int( + (2 / 512) * max(self.default_resolution[0], self.default_resolution[1])) + self.bake_mode = bake_mode + + self.raster_mode = raster_mode + if self.raster_mode == 'cr': + import custom_rasterizer as cr + self.raster = cr + else: + raise f'No raster named {self.raster_mode}' + + if camera_type == 'orth': + self.ortho_scale = 1.2 + self.camera_proj_mat = get_orthographic_projection_matrix( + left=-self.ortho_scale * 0.5, right=self.ortho_scale * 0.5, + bottom=-self.ortho_scale * 0.5, top=self.ortho_scale * 0.5, + near=0.1, far=100 + ) + elif camera_type == 'perspective': + self.camera_proj_mat = get_perspective_projection_matrix( + 49.13, self.default_resolution[1] / self.default_resolution[0], + 0.01, 100.0 + ) + else: + raise f'No camera type {camera_type}' + + def raster_rasterize(self, pos, tri, resolution, ranges=None, grad_db=True): + + if self.raster_mode == 'cr': + rast_out_db = None + if pos.dim() == 2: + pos = pos.unsqueeze(0) + findices, barycentric = self.raster.rasterize(pos, tri, resolution) + rast_out = torch.cat((barycentric, findices.unsqueeze(-1)), dim=-1) + rast_out = rast_out.unsqueeze(0) + else: + raise f'No raster named {self.raster_mode}' + + return rast_out, rast_out_db + + def raster_interpolate(self, uv, rast_out, uv_idx, rast_db=None, diff_attrs=None): + + if self.raster_mode == 'cr': + textd = None + barycentric = rast_out[0, ..., :-1] + findices = rast_out[0, ..., -1] + if uv.dim() == 2: + uv = uv.unsqueeze(0) + textc = self.raster.interpolate(uv, findices, barycentric, uv_idx) + else: + raise f'No raster named {self.raster_mode}' + + return textc, textd + + def raster_texture(self, tex, uv, uv_da=None, mip_level_bias=None, mip=None, filter_mode='auto', + boundary_mode='wrap', max_mip_level=None): + + if self.raster_mode == 'cr': + raise f'Texture is not implemented in cr' + else: + raise f'No raster named {self.raster_mode}' + + return color + + def raster_antialias(self, color, rast, pos, tri, topology_hash=None, pos_gradient_boost=1.0): + + if self.raster_mode == 'cr': + # Antialias has not been supported yet + color = color + else: + raise f'No raster named {self.raster_mode}' + + return color + + def load_mesh( + self, + mesh, + scale_factor=1.15, + auto_center=True, + ): + vtx_pos, pos_idx, vtx_uv, uv_idx, texture_data = load_mesh(mesh) + self.mesh_copy = mesh + self.set_mesh(vtx_pos, pos_idx, + vtx_uv=vtx_uv, uv_idx=uv_idx, + scale_factor=scale_factor, auto_center=auto_center + ) + if texture_data is not None: + self.set_texture(texture_data) + + def save_mesh(self): + texture_data = self.get_texture() + texture_data = Image.fromarray((texture_data * 255).astype(np.uint8)) + return save_mesh(self.mesh_copy, texture_data) + + def set_mesh( + self, + vtx_pos, pos_idx, + vtx_uv=None, uv_idx=None, + scale_factor=1.15, auto_center=True + ): + + self.vtx_pos = torch.from_numpy(vtx_pos).to(self.device).float() + self.pos_idx = torch.from_numpy(pos_idx).to(self.device).to(torch.int) + if (vtx_uv is not None) and (uv_idx is not 
None): + self.vtx_uv = torch.from_numpy(vtx_uv).to(self.device).float() + self.uv_idx = torch.from_numpy(uv_idx).to(self.device).to(torch.int) + else: + self.vtx_uv = None + self.uv_idx = None + + self.vtx_pos[:, [0, 1]] = -self.vtx_pos[:, [0, 1]] + self.vtx_pos[:, [1, 2]] = self.vtx_pos[:, [2, 1]] + if (vtx_uv is not None) and (uv_idx is not None): + self.vtx_uv[:, 1] = 1.0 - self.vtx_uv[:, 1] + + if auto_center: + max_bb = (self.vtx_pos - 0).max(0)[0] + min_bb = (self.vtx_pos - 0).min(0)[0] + center = (max_bb + min_bb) / 2 + scale = torch.norm(self.vtx_pos - center, dim=1).max() * 2.0 + self.vtx_pos = (self.vtx_pos - center) * \ + (scale_factor / float(scale)) + self.scale_factor = scale_factor + + def set_texture(self, tex): + if isinstance(tex, np.ndarray): + tex = Image.fromarray((tex * 255).astype(np.uint8)) + elif isinstance(tex, torch.Tensor): + tex = tex.cpu().numpy() + tex = Image.fromarray((tex * 255).astype(np.uint8)) + + tex = tex.resize(self.texture_size).convert('RGB') + tex = np.array(tex) / 255.0 + self.tex = torch.from_numpy(tex).to(self.device) + self.tex = self.tex.float() + + def set_default_render_resolution(self, default_resolution): + if isinstance(default_resolution, int): + default_resolution = (default_resolution, default_resolution) + self.default_resolution = default_resolution + + def set_default_texture_resolution(self, texture_size): + if isinstance(texture_size, int): + texture_size = (texture_size, texture_size) + self.texture_size = texture_size + + def get_mesh(self): + vtx_pos = self.vtx_pos.cpu().numpy() + pos_idx = self.pos_idx.cpu().numpy() + vtx_uv = self.vtx_uv.cpu().numpy() + uv_idx = self.uv_idx.cpu().numpy() + + # 坐标变换的逆变换 + vtx_pos[:, [1, 2]] = vtx_pos[:, [2, 1]] + vtx_pos[:, [0, 1]] = -vtx_pos[:, [0, 1]] + + vtx_uv[:, 1] = 1.0 - vtx_uv[:, 1] + return vtx_pos, pos_idx, vtx_uv, uv_idx + + def get_texture(self): + return self.tex.cpu().numpy() + + def to(self, device): + self.device = device + + for attr_name in dir(self): + attr_value = getattr(self, attr_name) + if isinstance(attr_value, torch.Tensor): + setattr(self, attr_name, attr_value.to(self.device)) + + def color_rgb_to_srgb(self, image): + if isinstance(image, Image.Image): + image_rgb = torch.tesnor( + np.array(image) / + 255.0).float().to( + self.device) + elif isinstance(image, np.ndarray): + image_rgb = torch.tensor(image).float() + else: + image_rgb = image.to(self.device) + + image_srgb = torch.where( + image_rgb <= 0.0031308, + 12.92 * image_rgb, + 1.055 * torch.pow(image_rgb, 1 / 2.4) - 0.055 + ) + + if isinstance(image, Image.Image): + image_srgb = Image.fromarray( + (image_srgb.cpu().numpy() * + 255).astype( + np.uint8)) + elif isinstance(image, np.ndarray): + image_srgb = image_srgb.cpu().numpy() + else: + image_srgb = image_srgb.to(image.device) + + return image_srgb + + def _render( + self, + glctx, + mvp, + pos, + pos_idx, + uv, + uv_idx, + tex, + resolution, + max_mip_level, + keep_alpha, + filter_mode + ): + pos_clip = transform_pos(mvp, pos) + if isinstance(resolution, (int, float)): + resolution = [resolution, resolution] + rast_out, rast_out_db = self.raster_rasterize( + glctx, pos_clip, pos_idx, resolution=resolution) + + tex = tex.contiguous() + if filter_mode == 'linear-mipmap-linear': + texc, texd = self.raster_interpolate( + uv[None, ...], rast_out, uv_idx, rast_db=rast_out_db, diff_attrs='all') + color = self.raster_texture( + tex[None, ...], texc, texd, filter_mode='linear-mipmap-linear', max_mip_level=max_mip_level) + else: + texc, _ = 
self.raster_interpolate(uv[None, ...], rast_out, uv_idx) + color = self.raster_texture(tex[None, ...], texc, filter_mode=filter_mode) + + visible_mask = torch.clamp(rast_out[..., -1:], 0, 1) + color = color * visible_mask # Mask out background. + if self.use_antialias: + color = self.raster_antialias(color, rast_out, pos_clip, pos_idx) + + if keep_alpha: + color = torch.cat([color, visible_mask], dim=-1) + return color[0, ...] + + def render( + self, + elev, + azim, + camera_distance=None, + center=None, + resolution=None, + tex=None, + keep_alpha=True, + bgcolor=None, + filter_mode=None, + return_type='th' + ): + + proj = self.camera_proj_mat + r_mv = get_mv_matrix( + elev=elev, + azim=azim, + camera_distance=self.camera_distance if camera_distance is None else camera_distance, + center=center) + r_mvp = np.matmul(proj, r_mv).astype(np.float32) + if tex is not None: + if isinstance(tex, Image.Image): + tex = torch.tensor(np.array(tex) / 255.0) + elif isinstance(tex, np.ndarray): + tex = torch.tensor(tex) + if tex.dim() == 2: + tex = tex.unsqueeze(-1) + tex = tex.float().to(self.device) + image = self._render(r_mvp, self.vtx_pos, self.pos_idx, self.vtx_uv, self.uv_idx, + self.tex if tex is None else tex, + self.default_resolution if resolution is None else resolution, + self.max_mip_level, True, filter_mode if filter_mode else self.filter_mode) + mask = (image[..., [-1]] == 1).float() + if bgcolor is None: + bgcolor = [0 for _ in range(image.shape[-1] - 1)] + image = image * mask + (1 - mask) * \ + torch.tensor(bgcolor + [0]).to(self.device) + if keep_alpha == False: + image = image[..., :-1] + if return_type == 'np': + image = image.cpu().numpy() + elif return_type == 'pl': + image = image.squeeze(-1).cpu().numpy() * 255 + image = Image.fromarray(image.astype(np.uint8)) + return image + + def render_normal( + self, + elev, + azim, + camera_distance=None, + center=None, + resolution=None, + bg_color=[1, 1, 1], + use_abs_coor=False, + normalize_rgb=True, + return_type='th' + ): + + pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center) + if resolution is None: + resolution = self.default_resolution + if isinstance(resolution, (int, float)): + resolution = [resolution, resolution] + rast_out, rast_out_db = self.raster_rasterize( + pos_clip, self.pos_idx, resolution=resolution) + + if use_abs_coor: + mesh_triangles = self.vtx_pos[self.pos_idx[:, :3], :] + else: + pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4] + mesh_triangles = pos_camera[self.pos_idx[:, :3], :] + face_normals = F.normalize( + torch.cross(mesh_triangles[:, + 1, + :] - mesh_triangles[:, + 0, + :], + mesh_triangles[:, + 2, + :] - mesh_triangles[:, + 0, + :], + dim=-1), + dim=-1) + + vertex_normals = trimesh.geometry.mean_vertex_normals(vertex_count=self.vtx_pos.shape[0], + faces=self.pos_idx.cpu(), + face_normals=face_normals.cpu(), ) + vertex_normals = torch.from_numpy( + vertex_normals).float().to(self.device).contiguous() + + # Interpolate normal values across the rasterized pixels + normal, _ = self.raster_interpolate( + vertex_normals[None, ...], rast_out, self.pos_idx) + + visible_mask = torch.clamp(rast_out[..., -1:], 0, 1) + normal = normal * visible_mask + \ + torch.tensor(bg_color, dtype=torch.float32, device=self.device) * (1 - + visible_mask) + + if normalize_rgb: + normal = (normal + 1) * 0.5 + if self.use_antialias: + normal = self.raster_antialias(normal, rast_out, pos_clip, self.pos_idx) + + image = normal[0, ...] 
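+        # Convert to the requested output: default 'th' keeps the torch tensor,
+        # 'np' returns a NumPy array, 'pl' an 8-bit PIL image.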
+ if return_type == 'np': + image = image.cpu().numpy() + elif return_type == 'pl': + image = image.cpu().numpy() * 255 + image = Image.fromarray(image.astype(np.uint8)) + + return image + + def convert_normal_map(self, image): + # blue is front, red is left, green is top + if isinstance(image, Image.Image): + image = np.array(image) + mask = (image == [255, 255, 255]).all(axis=-1) + + image = (image / 255.0) * 2.0 - 1.0 + + image[..., [1]] = -image[..., [1]] + image[..., [1, 2]] = image[..., [2, 1]] + image[..., [0]] = -image[..., [0]] + + image = (image + 1.0) * 0.5 + + image = (image * 255).astype(np.uint8) + image[mask] = [127, 127, 255] + + return Image.fromarray(image) + + def get_pos_from_mvp(self, elev, azim, camera_distance, center): + proj = self.camera_proj_mat + r_mv = get_mv_matrix( + elev=elev, + azim=azim, + camera_distance=self.camera_distance if camera_distance is None else camera_distance, + center=center) + + pos_camera = transform_pos(r_mv, self.vtx_pos, keepdim=True) + pos_clip = transform_pos(proj, pos_camera) + + return pos_camera, pos_clip + + def render_depth( + self, + elev, + azim, + camera_distance=None, + center=None, + resolution=None, + return_type='th' + ): + pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center) + + if resolution is None: + resolution = self.default_resolution + if isinstance(resolution, (int, float)): + resolution = [resolution, resolution] + rast_out, rast_out_db = self.raster_rasterize( + pos_clip, self.pos_idx, resolution=resolution) + + pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4] + tex_depth = pos_camera[:, 2].reshape(1, -1, 1).contiguous() + + # Interpolate depth values across the rasterized pixels + depth, _ = self.raster_interpolate(tex_depth, rast_out, self.pos_idx) + + visible_mask = torch.clamp(rast_out[..., -1:], 0, 1) + depth_max, depth_min = depth[visible_mask > + 0].max(), depth[visible_mask > 0].min() + depth = (depth - depth_min) / (depth_max - depth_min) + + depth = depth * visible_mask # Mask out background. + if self.use_antialias: + depth = self.raster_antialias(depth, rast_out, pos_clip, self.pos_idx) + + image = depth[0, ...] + if return_type == 'np': + image = image.cpu().numpy() + elif return_type == 'pl': + image = image.squeeze(-1).cpu().numpy() * 255 + image = Image.fromarray(image.astype(np.uint8)) + return image + + def render_position(self, elev, azim, camera_distance=None, center=None, + resolution=None, bg_color=[1, 1, 1], return_type='th'): + pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center) + if resolution is None: + resolution = self.default_resolution + if isinstance(resolution, (int, float)): + resolution = [resolution, resolution] + rast_out, rast_out_db = self.raster_rasterize( + pos_clip, self.pos_idx, resolution=resolution) + + tex_position = 0.5 - self.vtx_pos[:, :3] / self.scale_factor + tex_position = tex_position.contiguous() + + # Interpolate depth values across the rasterized pixels + position, _ = self.raster_interpolate( + tex_position[None, ...], rast_out, self.pos_idx) + + visible_mask = torch.clamp(rast_out[..., -1:], 0, 1) + + position = position * visible_mask + \ + torch.tensor(bg_color, dtype=torch.float32, device=self.device) * (1 - + visible_mask) + if self.use_antialias: + position = self.raster_antialias(position, rast_out, pos_clip, self.pos_idx) + + image = position[0, ...] 
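+        # image now holds per-pixel object-space coordinates remapped as 0.5 - xyz / scale_factor
+        # (roughly [0, 1] for a mesh normalized by set_mesh); background pixels take bg_color.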
+ + if return_type == 'np': + image = image.cpu().numpy() + elif return_type == 'pl': + image = image.squeeze(-1).cpu().numpy() * 255 + image = Image.fromarray(image.astype(np.uint8)) + return image + + def render_uvpos(self, return_type='th'): + image = self.uv_feature_map(self.vtx_pos * 0.5 + 0.5) + if return_type == 'np': + image = image.cpu().numpy() + elif return_type == 'pl': + image = image.cpu().numpy() * 255 + image = Image.fromarray(image.astype(np.uint8)) + return image + + def uv_feature_map(self, vert_feat, bg=None): + vtx_uv = self.vtx_uv * 2 - 1.0 + vtx_uv = torch.cat( + [vtx_uv, torch.zeros_like(self.vtx_uv)], dim=1).unsqueeze(0) + vtx_uv[..., -1] = 1 + uv_idx = self.uv_idx + rast_out, rast_out_db = self.raster_rasterize( + vtx_uv, uv_idx, resolution=self.texture_size) + feat_map, _ = self.raster_interpolate(vert_feat[None, ...], rast_out, uv_idx) + feat_map = feat_map[0, ...] + if bg is not None: + visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)[0, ...] + feat_map[visible_mask == 0] = bg + return feat_map + + def render_sketch_from_geometry(self, normal_image, depth_image): + normal_image_np = normal_image.cpu().numpy() + depth_image_np = depth_image.cpu().numpy() + + normal_image_np = (normal_image_np * 255).astype(np.uint8) + depth_image_np = (depth_image_np * 255).astype(np.uint8) + normal_image_np = cv2.cvtColor(normal_image_np, cv2.COLOR_RGB2GRAY) + + normal_edges = cv2.Canny(normal_image_np, 80, 150) + depth_edges = cv2.Canny(depth_image_np, 30, 80) + + combined_edges = np.maximum(normal_edges, depth_edges) + + sketch_image = torch.from_numpy(combined_edges).to( + normal_image.device).float() / 255.0 + sketch_image = sketch_image.unsqueeze(-1) + + return sketch_image + + def render_sketch_from_depth(self, depth_image): + depth_image_np = depth_image.cpu().numpy() + depth_image_np = (depth_image_np * 255).astype(np.uint8) + depth_edges = cv2.Canny(depth_image_np, 30, 80) + combined_edges = depth_edges + sketch_image = torch.from_numpy(combined_edges).to( + depth_image.device).float() / 255.0 + sketch_image = sketch_image.unsqueeze(-1) + return sketch_image + + def back_project(self, image, elev, azim, + camera_distance=None, center=None, method=None): + if isinstance(image, Image.Image): + image = torch.tensor(np.array(image) / 255.0) + elif isinstance(image, np.ndarray): + image = torch.tensor(image) + if image.dim() == 2: + image = image.unsqueeze(-1) + image = image.float().to(self.device) + resolution = image.shape[:2] + channel = image.shape[-1] + texture = torch.zeros(self.texture_size + (channel,)).to(self.device) + cos_map = torch.zeros(self.texture_size + (1,)).to(self.device) + + proj = self.camera_proj_mat + r_mv = get_mv_matrix( + elev=elev, + azim=azim, + camera_distance=self.camera_distance if camera_distance is None else camera_distance, + center=center) + pos_camera = transform_pos(r_mv, self.vtx_pos, keepdim=True) + pos_clip = transform_pos(proj, pos_camera) + pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4] + v0 = pos_camera[self.pos_idx[:, 0], :] + v1 = pos_camera[self.pos_idx[:, 1], :] + v2 = pos_camera[self.pos_idx[:, 2], :] + face_normals = F.normalize( + torch.cross( + v1 - v0, + v2 - v0, + dim=-1), + dim=-1) + vertex_normals = trimesh.geometry.mean_vertex_normals(vertex_count=self.vtx_pos.shape[0], + faces=self.pos_idx.cpu(), + face_normals=face_normals.cpu(), ) + vertex_normals = torch.from_numpy( + vertex_normals).float().to(self.device).contiguous() + tex_depth = pos_camera[:, 2].reshape(1, -1, 1).contiguous() + rast_out, rast_out_db 
= self.raster_rasterize( + pos_clip, self.pos_idx, resolution=resolution) + visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)[0, ...] + + normal, _ = self.raster_interpolate( + vertex_normals[None, ...], rast_out, self.pos_idx) + normal = normal[0, ...] + uv, _ = self.raster_interpolate(self.vtx_uv[None, ...], rast_out, self.uv_idx) + depth, _ = self.raster_interpolate(tex_depth, rast_out, self.pos_idx) + depth = depth[0, ...] + + depth_max, depth_min = depth[visible_mask > + 0].max(), depth[visible_mask > 0].min() + depth_normalized = (depth - depth_min) / (depth_max - depth_min) + depth_image = depth_normalized * visible_mask # Mask out background. + + sketch_image = self.render_sketch_from_depth(depth_image) + + lookat = torch.tensor([[0, 0, -1]], device=self.device) + cos_image = torch.nn.functional.cosine_similarity( + lookat, normal.view(-1, 3)) + cos_image = cos_image.view(normal.shape[0], normal.shape[1], 1) + + cos_thres = np.cos(self.bake_angle_thres / 180 * np.pi) + cos_image[cos_image < cos_thres] = 0 + + # shrink + kernel_size = self.bake_unreliable_kernel_size * 2 + 1 + kernel = torch.ones( + (1, 1, kernel_size, kernel_size), dtype=torch.float32).to( + sketch_image.device) + + visible_mask = visible_mask.permute(2, 0, 1).unsqueeze(0).float() + visible_mask = F.conv2d( + 1.0 - visible_mask, + kernel, + padding=kernel_size // 2) + visible_mask = 1.0 - (visible_mask > 0).float() # 二值化 + visible_mask = visible_mask.squeeze(0).permute(1, 2, 0) + + sketch_image = sketch_image.permute(2, 0, 1).unsqueeze(0) + sketch_image = F.conv2d(sketch_image, kernel, padding=kernel_size // 2) + sketch_image = (sketch_image > 0).float() # 二值化 + sketch_image = sketch_image.squeeze(0).permute(1, 2, 0) + visible_mask = visible_mask * (sketch_image < 0.5) + + cos_image[visible_mask == 0] = 0 + + method = self.bake_mode if method is None else method + + if method == 'linear': + proj_mask = (visible_mask != 0).view(-1) + uv = uv.squeeze(0).contiguous().view(-1, 2)[proj_mask] + image = image.squeeze(0).contiguous().view(-1, channel)[proj_mask] + cos_image = cos_image.contiguous().view(-1, 1)[proj_mask] + sketch_image = sketch_image.contiguous().view(-1, 1)[proj_mask] + + texture = linear_grid_put_2d( + self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], image) + cos_map = linear_grid_put_2d( + self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], cos_image) + boundary_map = linear_grid_put_2d( + self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], sketch_image) + else: + raise f'No bake mode {method}' + + return texture, cos_map, boundary_map + + def bake_texture(self, colors, elevs, azims, + camera_distance=None, center=None, exp=6, weights=None): + for i in range(len(colors)): + if isinstance(colors[i], Image.Image): + colors[i] = torch.tensor( + np.array( + colors[i]) / 255.0, + device=self.device).float() + if weights is None: + weights = [1.0 for _ in range(colors)] + textures = [] + cos_maps = [] + for color, elev, azim, weight in zip(colors, elevs, azims, weights): + texture, cos_map, _ = self.back_project( + color, elev, azim, camera_distance, center) + cos_map = weight * (cos_map ** exp) + textures.append(texture) + cos_maps.append(cos_map) + + texture_merge, trust_map_merge = self.fast_bake_texture( + textures, cos_maps) + return texture_merge, trust_map_merge + + @torch.no_grad() + def fast_bake_texture(self, textures, cos_maps): + + channel = textures[0].shape[-1] + texture_merge = torch.zeros( + self.texture_size + (channel,)).to(self.device) + trust_map_merge = 
torch.zeros(self.texture_size + (1,)).to(self.device) + for texture, cos_map in zip(textures, cos_maps): + view_sum = (cos_map > 0).sum() + painted_sum = ((cos_map > 0) * (trust_map_merge > 0)).sum() + if painted_sum / view_sum > 0.99: + continue + texture_merge += texture * cos_map + trust_map_merge += cos_map + texture_merge = texture_merge / torch.clamp(trust_map_merge, min=1E-8) + + return texture_merge, trust_map_merge > 1E-8 + + def uv_inpaint(self, texture, mask): + + if isinstance(texture, torch.Tensor): + texture_np = texture.cpu().numpy() + elif isinstance(texture, np.ndarray): + texture_np = texture + elif isinstance(texture, Image.Image): + texture_np = np.array(texture) / 255.0 + + vtx_pos, pos_idx, vtx_uv, uv_idx = self.get_mesh() + + texture_np, mask = meshVerticeInpaint( + texture_np, mask, vtx_pos, vtx_uv, pos_idx, uv_idx) + + texture_np = cv2.inpaint( + (texture_np * + 255).astype( + np.uint8), + 255 - + mask, + 3, + cv2.INPAINT_NS) + + return texture_np diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_utils.py b/hy3dgen/texgen/differentiable_renderer/mesh_utils.py new file mode 100644 index 0000000..fa5694a --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/mesh_utils.py @@ -0,0 +1,34 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import trimesh + + +def load_mesh(mesh): + vtx_pos = mesh.vertices if hasattr(mesh, 'vertices') else None + pos_idx = mesh.faces if hasattr(mesh, 'faces') else None + + vtx_uv = mesh.visual.uv if hasattr(mesh.visual, 'uv') else None + uv_idx = mesh.faces if hasattr(mesh, 'faces') else None + + texture_data = None + + return vtx_pos, pos_idx, vtx_uv, uv_idx, texture_data + + +def save_mesh(mesh, texture_data): + material = trimesh.visual.texture.SimpleMaterial(image=texture_data, diffuse=(255, 255, 255)) + texture_visuals = trimesh.visual.TextureVisuals(uv=mesh.visual.uv, image=texture_data, material=material) + mesh.visual = texture_visuals + return mesh diff --git a/hy3dgen/texgen/differentiable_renderer/setup.py b/hy3dgen/texgen/differentiable_renderer/setup.py new file mode 100644 index 0000000..1bfdb10 --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/setup.py @@ -0,0 +1,62 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. 
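# Illustrative texture-baking flow with MeshRender above. A sketch under assumptions: CUDA and the
# custom_rasterizer extension are available, 'textured_mesh.obj' is a placeholder path for a
# trimesh with UVs, and the view angles / images are made-up stand-ins for the repo's real inputs.
import numpy as np
import trimesh
from PIL import Image
from hy3dgen.texgen.differentiable_renderer.mesh_render import MeshRender

renderer = MeshRender(default_resolution=1024, texture_size=1024, camera_type='orth', device='cuda')
renderer.load_mesh(trimesh.load('textured_mesh.obj'))

elevs, azims = [0, 0, 0, 0], [0, 90, 180, 270]                    # four views around the object
views = [Image.open(f'view_{i}.png') for i in range(4)]           # hypothetical RGB views to project

texture, trust = renderer.bake_texture(views, elevs, azims, weights=[1.0] * len(views))
mask = (trust[..., 0].cpu().numpy() * 255).astype(np.uint8)       # trusted texels as a 0/255 mask
texture = renderer.uv_inpaint(texture, mask)                      # vertex inpaint + cv2.INPAINT_NS
renderer.set_texture(texture / 255.0)                             # uv_inpaint returns uint8
baked_mesh = renderer.save_mesh()                                 # trimesh carrying the baked texture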
+# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +from setuptools import setup, Extension +import pybind11 +import sys +import platform + +def get_platform_specific_args(): + system = platform.system().lower() + cpp_std = 'c++14' # Make configurable if needed + + if sys.platform == 'win32': + compile_args = ['/O2', f'/std:{cpp_std}', '/EHsc', '/MP', '/DWIN32_LEAN_AND_MEAN', '/bigobj'] + link_args = [] + extra_includes = [] + elif system == 'linux': + compile_args = ['-O3', f'-std={cpp_std}', '-fPIC', '-Wall', '-Wextra', '-pthread'] + link_args = ['-fPIC', '-pthread'] + extra_includes = [] + elif sys.platform == 'darwin': + compile_args = ['-O3', f'-std={cpp_std}', '-fPIC', '-Wall', '-Wextra', + '-stdlib=libc++', '-mmacosx-version-min=10.14'] + link_args = ['-fPIC', '-stdlib=libc++', '-mmacosx-version-min=10.14', '-dynamiclib'] + extra_includes = [] + else: + raise RuntimeError(f"Unsupported platform: {system}") + + return compile_args, link_args, extra_includes + +extra_compile_args, extra_link_args, platform_includes = get_platform_specific_args() +include_dirs = [pybind11.get_include(), pybind11.get_include(user=True)] +include_dirs.extend(platform_includes) + +ext_modules = [ + Extension( + "mesh_processor", + ["mesh_processor.cpp"], + include_dirs=include_dirs, + language='c++', + extra_compile_args=extra_compile_args, + extra_link_args=extra_link_args, + ), +] + +setup( + name="mesh_processor", + ext_modules=ext_modules, + install_requires=['pybind11>=2.6.0'], + python_requires='>=3.6', +) \ No newline at end of file diff --git a/hy3dgen/texgen/hunyuanpaint/__init__.py b/hy3dgen/texgen/hunyuanpaint/__init__.py new file mode 100644 index 0000000..8bb2bf8 --- /dev/null +++ b/hy3dgen/texgen/hunyuanpaint/__init__.py @@ -0,0 +1,13 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
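# Build sketch for the pybind11 extension declared in setup.py above (assumes pybind11 and a
# C++14 toolchain are installed; the in-place build command is the standard setuptools invocation,
# not something the patch itself prescribes):
#
#   cd hy3dgen/texgen/differentiable_renderer
#   python setup.py build_ext --inplace
#
# After building, the compiled module should import next to the pure-Python helpers:
import mesh_processor  # C++ module built from mesh_processor.cpp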
\ No newline at end of file diff --git a/hy3dgen/texgen/hunyuanpaint/pipeline.py b/hy3dgen/texgen/hunyuanpaint/pipeline.py new file mode 100644 index 0000000..38f3777 --- /dev/null +++ b/hy3dgen/texgen/hunyuanpaint/pipeline.py @@ -0,0 +1,722 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy +import numpy as np +import torch +import torch.distributed +import torch.utils.checkpoint +import transformers +from PIL import Image +import diffusers +from diffusers import ( + AutoencoderKL, + DDPMScheduler, + DiffusionPipeline, + EulerAncestralDiscreteScheduler, + UNet2DConditionModel, + ImagePipelineOutput +) +from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback +from diffusers.image_processor import PipelineImageInput +from diffusers.image_processor import VaeImageProcessor +from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput +from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipeline, \ + retrieve_timesteps, rescale_noise_cfg +from diffusers.schedulers import KarrasDiffusionSchedulers, LCMScheduler +from diffusers.utils import deprecate +from einops import rearrange +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from .unet.modules import UNet2p5DConditionModel, \ + compute_multi_resolution_mask, compute_multi_resolution_discrete_voxel_indice + +def guidance_scale_embedding(w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + +def append_dims(x, target_dims): + """Appends dimensions to the end of a tensor until it has target_dims dimensions.""" + dims_to_append = target_dims - x.ndim + if dims_to_append < 
0: + raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less") + return x[(...,) + (None,) * dims_to_append] + + +# From LCMScheduler.get_scalings_for_boundary_condition_discrete +def scalings_for_boundary_conditions(timestep, sigma_data=0.5, timestep_scaling=10.0): + scaled_timestep = timestep_scaling * timestep + c_skip = sigma_data ** 2 / (scaled_timestep ** 2 + sigma_data ** 2) + c_out = scaled_timestep / (scaled_timestep ** 2 + sigma_data ** 2) ** 0.5 + return c_skip, c_out + + +# Compare LCMScheduler.step, Step 4 +def get_predicted_original_sample(model_output, timesteps, sample, prediction_type, alphas, sigmas, N_gen): + alphas = extract_into_tensor(alphas, timesteps, sample.shape, N_gen) + sigmas = extract_into_tensor(sigmas, timesteps, sample.shape, N_gen) + model_output = rearrange(model_output, '(b n) c h w -> b n c h w', n=N_gen) + if prediction_type == "epsilon": + pred_x_0 = (sample - sigmas * model_output) / alphas + elif prediction_type == "sample": + pred_x_0 = model_output + elif prediction_type == "v_prediction": + pred_x_0 = alphas * sample - sigmas * model_output + else: + raise ValueError( + f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`" + f" are supported." + ) + + return pred_x_0 + + +# Based on step 4 in DDIMScheduler.step +def get_predicted_noise(model_output, timesteps, sample, prediction_type, alphas, sigmas, N_gen): + alphas = extract_into_tensor(alphas, timesteps, sample.shape, N_gen) + sigmas = extract_into_tensor(sigmas, timesteps, sample.shape, N_gen) + model_output = rearrange(model_output, '(b n) c h w -> b n c h w', n=N_gen) + if prediction_type == "epsilon": + pred_epsilon = model_output + elif prediction_type == "sample": + pred_epsilon = (sample - alphas * model_output) / sigmas + elif prediction_type == "v_prediction": + pred_epsilon = alphas * model_output + sigmas * sample + else: + raise ValueError( + f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`" + f" are supported." 
+ ) + + return pred_epsilon + +def extract_into_tensor(a, t, x_shape, N_gen): + # b, *_ = t.shape + out = a.gather(-1, t) + out = out.repeat(N_gen) + out = rearrange(out, '(b n) -> b n', n=N_gen) + b, c, *_ = out.shape + return out.reshape(b, c, *((1,) * (len(x_shape) - 2))) + +class DDIMSolver: + def __init__(self, alpha_cumprods, timesteps=1000, ddim_timesteps=50): + # DDIM sampling parameters + step_ratio = timesteps // ddim_timesteps + self.ddim_timesteps = (np.arange(1, ddim_timesteps + 1) * step_ratio).round().astype(np.int64) - 1 + self.ddim_alpha_cumprods = alpha_cumprods[self.ddim_timesteps] + self.ddim_alpha_cumprods_prev = np.asarray( + [alpha_cumprods[0]] + alpha_cumprods[self.ddim_timesteps[:-1]].tolist() + ) + # convert to torch tensors + self.ddim_timesteps = torch.from_numpy(self.ddim_timesteps).long() + self.ddim_alpha_cumprods = torch.from_numpy(self.ddim_alpha_cumprods) + self.ddim_alpha_cumprods_prev = torch.from_numpy(self.ddim_alpha_cumprods_prev) + + def to(self, device): + self.ddim_timesteps = self.ddim_timesteps.to(device) + self.ddim_alpha_cumprods = self.ddim_alpha_cumprods.to(device) + self.ddim_alpha_cumprods_prev = self.ddim_alpha_cumprods_prev.to(device) + return self + + def ddim_step(self, pred_x0, pred_noise, timestep_index, N_gen): + alpha_cumprod_prev = extract_into_tensor(self.ddim_alpha_cumprods_prev, timestep_index, pred_x0.shape, N_gen) + dir_xt = (1.0 - alpha_cumprod_prev).sqrt() * pred_noise + x_prev = alpha_cumprod_prev.sqrt() * pred_x0 + dir_xt + return x_prev + + +@torch.no_grad() +def update_ema(target_params, source_params, rate=0.99): + """ + Update target parameters to be closer to those of source parameters using + an exponential moving average. + + :param target_params: the target parameter sequence. + :param source_params: the source parameter sequence. + :param rate: the EMA rate (closer to 1 means slower). 
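+    Example: with rate=0.99 each call leaves targ at 0.99 * targ + 0.01 * src.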
+ """ + + for targ, src in zip(target_params, source_params): + targ.detach().mul_(rate).add_(src, alpha=1 - rate) + +def to_rgb_image(maybe_rgba: Image.Image): + if maybe_rgba.mode == 'RGB': + return maybe_rgba + elif maybe_rgba.mode == 'RGBA': + rgba = maybe_rgba + img = numpy.random.randint(127, 128, size=[rgba.size[1], rgba.size[0], 3], dtype=numpy.uint8) + img = Image.fromarray(img, 'RGB') + img.paste(rgba, mask=rgba.getchannel('A')) + return img + else: + raise ValueError("Unsupported image type.", maybe_rgba.mode) + + +class HunyuanPaintPipeline(StableDiffusionPipeline): + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2p5DConditionModel, + scheduler: KarrasDiffusionSchedulers, + feature_extractor: CLIPImageProcessor, + safety_checker=None, + use_torch_compile=False, + ): + DiffusionPipeline.__init__(self) + + safety_checker = None + self.register_modules( + vae=torch.compile(vae) if use_torch_compile else vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=torch.compile(feature_extractor) if use_torch_compile else feature_extractor, + ) + self.solver = DDIMSolver( + scheduler.alphas_cumprod.numpy(), + timesteps=scheduler.config.num_train_timesteps, + ddim_timesteps=30, + ).to('cuda') + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.is_turbo = False + + def set_turbo(self, is_turbo: bool): + self.is_turbo = is_turbo + + @torch.no_grad() + def encode_images(self, images): + B = images.shape[0] + images = rearrange(images, 'b n c h w -> (b n) c h w') + + dtype = next(self.vae.parameters()).dtype + images = (images - 0.5) * 2.0 + posterior = self.vae.encode(images.to(dtype)).latent_dist + latents = posterior.sample() * self.vae.config.scaling_factor + + latents = rearrange(latents, '(b n) c h w -> b n c h w', b=B) + return latents + + @torch.no_grad() + def __call__( + self, + image: Image.Image = None, + prompt=None, + negative_prompt='watermark, ugly, deformed, noisy, blurry, low contrast', + *args, + num_images_per_prompt: Optional[int] = 1, + guidance_scale=2.0, + output_type: Optional[str] = "pil", + width=512, + height=512, + num_inference_steps=28, + return_dict=True, + **cached_condition, + ): + device = self._execution_device + + if image is None: + raise ValueError("Inputting embeddings not supported for this pipeline. Please pass an image.") + assert not isinstance(image, torch.Tensor) + + if not isinstance(image, List): + image = [image] + + image = [to_rgb_image(img) for img in image] + + image_vae = [torch.tensor(np.array(img) / 255.0) for img in image] + image_vae = [img_vae.unsqueeze(0).permute(0, 3, 1, 2).unsqueeze(0) for img_vae in image_vae] + image_vae = torch.cat(image_vae, dim=1) + image_vae = image_vae.to(device=device, dtype=self.vae.dtype) + + batch_size, N_ref = image_vae.shape[0], image_vae.shape[1] + assert batch_size == 1 + assert num_images_per_prompt == 1 + + ref_latents = self.encode_images(image_vae) + + def convert_pil_list_to_tensor(images): + bg_c = [1., 1., 1.] + images_tensor = [] + for batch_imgs in images: + view_imgs = [] + for pil_img in batch_imgs: + img = numpy.asarray(pil_img, dtype=numpy.float32) / 255. 
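+                    # RGBA inputs: composite RGB onto the white background bg_c via the alpha channel.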
+ if img.shape[2] > 3: + alpha = img[:, :, 3:] + img = img[:, :, :3] * alpha + bg_c * (1 - alpha) + img = torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0).contiguous().half().to("cuda") + view_imgs.append(img) + view_imgs = torch.cat(view_imgs, dim=0) + images_tensor.append(view_imgs.unsqueeze(0)) + + images_tensor = torch.cat(images_tensor, dim=0) + return images_tensor + + if "normal_imgs" in cached_condition: + + if isinstance(cached_condition["normal_imgs"], List): + cached_condition["normal_imgs"] = convert_pil_list_to_tensor(cached_condition["normal_imgs"]) + + cached_condition['normal_imgs'] = self.encode_images(cached_condition["normal_imgs"]) + + if "position_imgs" in cached_condition: + + if isinstance(cached_condition["position_imgs"], List): + cached_condition["position_imgs"] = convert_pil_list_to_tensor(cached_condition["position_imgs"]) + + cached_condition['position_maps'] = cached_condition['position_imgs'] + cached_condition["position_imgs"] = self.encode_images(cached_condition["position_imgs"]) + + if 'camera_info_gen' in cached_condition: + camera_info = cached_condition['camera_info_gen'] # B,N + if isinstance(camera_info, List): + camera_info = torch.tensor(camera_info) + camera_info = camera_info.to(device).to(torch.int64) + cached_condition['camera_info_gen'] = camera_info + if 'camera_info_ref' in cached_condition: + camera_info = cached_condition['camera_info_ref'] # B,N + if isinstance(camera_info, List): + camera_info = torch.tensor(camera_info) + camera_info = camera_info.to(device).to(torch.int64) + cached_condition['camera_info_ref'] = camera_info + + cached_condition['ref_latents'] = ref_latents + + if self.is_turbo: + if 'position_maps' in cached_condition: + cached_condition['position_attn_mask'] = ( + compute_multi_resolution_mask(cached_condition['position_maps']) + ) + cached_condition['position_voxel_indices'] = ( + compute_multi_resolution_discrete_voxel_indice(cached_condition['position_maps']) + ) + + if (guidance_scale > 1) and (not self.is_turbo): + negative_ref_latents = torch.zeros_like(cached_condition['ref_latents']) + cached_condition['ref_latents'] = torch.cat([negative_ref_latents, cached_condition['ref_latents']]) + cached_condition['ref_scale'] = torch.as_tensor([0.0, 1.0]).to(cached_condition['ref_latents']) + if "normal_imgs" in cached_condition: + cached_condition['normal_imgs'] = torch.cat( + (cached_condition['normal_imgs'], cached_condition['normal_imgs'])) + + if "position_imgs" in cached_condition: + cached_condition['position_imgs'] = torch.cat( + (cached_condition['position_imgs'], cached_condition['position_imgs'])) + + if 'position_maps' in cached_condition: + cached_condition['position_maps'] = torch.cat( + (cached_condition['position_maps'], cached_condition['position_maps'])) + + if 'camera_info_gen' in cached_condition: + cached_condition['camera_info_gen'] = torch.cat( + (cached_condition['camera_info_gen'], cached_condition['camera_info_gen'])) + if 'camera_info_ref' in cached_condition: + cached_condition['camera_info_ref'] = torch.cat( + (cached_condition['camera_info_ref'], cached_condition['camera_info_ref'])) + + prompt_embeds = self.unet.learned_text_clip_gen.repeat(num_images_per_prompt, 1, 1) + negative_prompt_embeds = torch.zeros_like(prompt_embeds) + + latents: torch.Tensor = self.denoise( + None, + *args, + cross_attention_kwargs=None, + guidance_scale=guidance_scale, + num_images_per_prompt=num_images_per_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + 
num_inference_steps=num_inference_steps, + output_type='latent', + width=width, + height=height, + **cached_condition + ).images + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + else: + image = latents + + image = self.image_processor.postprocess(image, output_type=output_type) + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) + + def denoise( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + sigmas: List[float] = None, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. 
+ eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.Tensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`] + (https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + guidance_rescale (`float`, *optional*, defaults to 0.0): + Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when + using zero terminal SNR. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): + A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of + each denoising step during the inference. with the following arguments: `callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a + list of all tensors as specified by `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. 
You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated,", + "consider using `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated,", + "consider using `callback_on_step_end`", + ) + + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + # to deal with lora scaling and other possible forward hooks + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._guidance_rescale = guidance_rescale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._interrupt = False + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3. Encode input prompt + lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + self.do_classifier_free_guidance if self.is_turbo else False, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + clip_skip=self.clip_skip, + ) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if (self.do_classifier_free_guidance) and (not self.is_turbo): + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance if self.is_turbo else False, + ) + + # 4. 
Prepare + if self.is_turbo: + bsz = 3 + N_gen = 15 + index = torch.range(29, 0, -bsz, device='cuda').long() + timesteps = self.solver.ddim_timesteps[index] + self.scheduler.set_timesteps(timesteps=timesteps.cpu(), device='cuda') + else: + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) + + assert num_images_per_prompt == 1 + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * kwargs['num_in_batch'], # num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 6.1 Add image embeds for IP-Adapter + added_cond_kwargs = ( + {"image_embeds": image_embeds} + if (ip_adapter_image is not None or ip_adapter_image_embeds is not None) + else None + ) + + # 6.2 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + # expand the latents if we are doing classifier free guidance + latents = rearrange(latents, '(b n) c h w -> b n c h w', n=kwargs['num_in_batch']) + latent_model_input = ( + torch.cat([latents] * 2) + if ((self.do_classifier_free_guidance) and (not self.is_turbo)) + else latents + ) + latent_model_input = rearrange(latent_model_input, 'b n c h w -> (b n) c h w') + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + latent_model_input = rearrange(latent_model_input, '(b n) c h w ->b n c h w', n=kwargs['num_in_batch']) + + # predict the noise residual + + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, **kwargs + )[0] + latents = rearrange(latents, 'b n c h w -> (b n) c h w') + # perform guidance + if (self.do_classifier_free_guidance) and (not self.is_turbo): + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + if (self.do_classifier_free_guidance) and (self.guidance_rescale > 0.0) and (not self.is_turbo): + # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + latents = \ + self.scheduler.step(noise_pred, t, latents[:, :num_channels_latents, :, :], **extra_step_kwargs, + return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[ + 0 + ] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/hy3dgen/texgen/hunyuanpaint/unet/__init__.py b/hy3dgen/texgen/hunyuanpaint/unet/__init__.py new file mode 100644 index 0000000..8bb2bf8 --- /dev/null +++ b/hy3dgen/texgen/hunyuanpaint/unet/__init__.py @@ -0,0 +1,13 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. \ No newline at end of file diff --git a/hy3dgen/texgen/hunyuanpaint/unet/modules.py b/hy3dgen/texgen/hunyuanpaint/unet/modules.py new file mode 100644 index 0000000..f558cd7 --- /dev/null +++ b/hy3dgen/texgen/hunyuanpaint/unet/modules.py @@ -0,0 +1,599 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. 
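# Illustrative call sketch for HunyuanPaintPipeline.__call__ above. Everything below is a
# hypothetical assumption for illustration: `pipe` stands for an already-loaded pipeline (the
# texgen wrapper normally constructs and drives it), the file names are placeholders, and the
# six-view setup and camera index encoding are examples rather than the repo's configuration.
from PIL import Image

ref_image = Image.open('reference.png')                               # single reference image
normal_views = [[Image.open(f'normal_{i}.png') for i in range(6)]]    # one batch of 6 normal maps
position_views = [[Image.open(f'position_{i}.png') for i in range(6)]]

out = pipe(
    image=ref_image,
    normal_imgs=normal_views,
    position_imgs=position_views,
    camera_info_gen=[[0, 1, 2, 3, 4, 5]],    # per-view camera indices, shape (B, N)
    camera_info_ref=[[0]],
    num_in_batch=6,                          # number of views denoised jointly
    num_inference_steps=28,
    guidance_scale=2.0,
)
generated_views = out.images                 # list of PIL images, one per view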
+# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import copy +import json +import os +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from diffusers.models import UNet2DConditionModel +from diffusers.models.attention_processor import Attention +from diffusers.models.transformers.transformer_2d import BasicTransformerBlock +from einops import rearrange + + +def _chunked_feed_forward(ff: nn.Module, hidden_states: torch.Tensor, chunk_dim: int, chunk_size: int): + # "feed_forward_chunk_size" can be used to save memory + if hidden_states.shape[chunk_dim] % chunk_size != 0: + raise ValueError( + f"`hidden_states` dimension to be chunked: {hidden_states.shape[chunk_dim]}" + f"has to be divisible by chunk size: {chunk_size}." + f" Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`." + ) + + num_chunks = hidden_states.shape[chunk_dim] // chunk_size + ff_output = torch.cat( + [ff(hid_slice) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)], + dim=chunk_dim, + ) + return ff_output + + +class Basic2p5DTransformerBlock(torch.nn.Module): + def __init__(self, transformer: BasicTransformerBlock,layer_name,use_ma=True,use_ra=True,is_turbo=False) -> None: + super().__init__() + self.transformer = transformer + self.layer_name = layer_name + self.use_ma = use_ma + self.use_ra = use_ra + self.is_turbo = is_turbo + + # multiview attn + if self.use_ma: + self.attn_multiview = Attention( + query_dim=self.dim, + heads=self.num_attention_heads, + dim_head=self.attention_head_dim, + dropout=self.dropout, + bias=self.attention_bias, + cross_attention_dim=None, + upcast_attention=self.attn1.upcast_attention, + out_bias=True, + ) + + # ref attn + if self.use_ra: + self.attn_refview = Attention( + query_dim=self.dim, + heads=self.num_attention_heads, + dim_head=self.attention_head_dim, + dropout=self.dropout, + bias=self.attention_bias, + cross_attention_dim=None, + upcast_attention=self.attn1.upcast_attention, + out_bias=True, + ) + if self.is_turbo: + self._initialize_attn_weights() + + def _initialize_attn_weights(self): + + if self.use_ma: + self.attn_multiview.load_state_dict(self.attn1.state_dict()) + with torch.no_grad(): + for layer in self.attn_multiview.to_out: + for param in layer.parameters(): + param.zero_() + if self.use_ra: + self.attn_refview.load_state_dict(self.attn1.state_dict()) + with torch.no_grad(): + for layer in self.attn_refview.to_out: + for param in layer.parameters(): + param.zero_() + + def __getattr__(self, name: str): + try: + return super().__getattr__(name) + except AttributeError: + return getattr(self.transformer, name) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + timestep: 
Optional[torch.LongTensor] = None, + cross_attention_kwargs: Dict[str, Any] = None, + class_labels: Optional[torch.LongTensor] = None, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, + ) -> torch.Tensor: + + # Notice that normalization is always applied before the real computation in the following blocks. + # 0. Self-Attention + batch_size = hidden_states.shape[0] + + cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {} + num_in_batch = cross_attention_kwargs.pop('num_in_batch', 1) + mode = cross_attention_kwargs.pop('mode', None) + if not self.is_turbo: + mva_scale = cross_attention_kwargs.pop('mva_scale', 1.0) + ref_scale = cross_attention_kwargs.pop('ref_scale', 1.0) + else: + position_attn_mask = cross_attention_kwargs.pop("position_attn_mask", None) + position_voxel_indices = cross_attention_kwargs.pop("position_voxel_indices", None) + mva_scale = 1.0 + ref_scale = 1.0 + + condition_embed_dict = cross_attention_kwargs.pop("condition_embed_dict", None) + + if self.norm_type == "ada_norm": + norm_hidden_states = self.norm1(hidden_states, timestep) + elif self.norm_type == "ada_norm_zero": + norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1( + hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype + ) + elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]: + norm_hidden_states = self.norm1(hidden_states) + elif self.norm_type == "ada_norm_continuous": + norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"]) + elif self.norm_type == "ada_norm_single": + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = ( + self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1) + ).chunk(6, dim=1) + norm_hidden_states = self.norm1(hidden_states) + norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa + else: + raise ValueError("Incorrect norm used") + + if self.pos_embed is not None: + norm_hidden_states = self.pos_embed(norm_hidden_states) + + # 1. 
Prepare GLIGEN inputs + cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {} + gligen_kwargs = cross_attention_kwargs.pop("gligen", None) + + attn_output = self.attn1( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + + if self.norm_type == "ada_norm_zero": + attn_output = gate_msa.unsqueeze(1) * attn_output + elif self.norm_type == "ada_norm_single": + attn_output = gate_msa * attn_output + + hidden_states = attn_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + # 1.2 Reference Attention + if 'w' in mode: + condition_embed_dict[self.layer_name] = rearrange( + norm_hidden_states, '(b n) l c -> b (n l) c', + n=num_in_batch + ) # B, (N L), C + + if 'r' in mode and self.use_ra: + condition_embed = condition_embed_dict[self.layer_name].unsqueeze(1).repeat(1, num_in_batch, 1, + 1) # B N L C + condition_embed = rearrange(condition_embed, 'b n l c -> (b n) l c') + + attn_output = self.attn_refview( + norm_hidden_states, + encoder_hidden_states=condition_embed, + attention_mask=None, + **cross_attention_kwargs + ) + if not self.is_turbo: + ref_scale_timing = ref_scale + if isinstance(ref_scale, torch.Tensor): + ref_scale_timing = ref_scale.unsqueeze(1).repeat(1, num_in_batch).view(-1) + for _ in range(attn_output.ndim - 1): + ref_scale_timing = ref_scale_timing.unsqueeze(-1) + + hidden_states = ref_scale_timing * attn_output + hidden_states + + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + # 1.3 Multiview Attention + if num_in_batch > 1 and self.use_ma: + multivew_hidden_states = rearrange(norm_hidden_states, '(b n) l c -> b (n l) c', n=num_in_batch) + + if self.is_turbo: + position_mask = None + if position_attn_mask is not None: + if multivew_hidden_states.shape[1] in position_attn_mask: + position_mask = position_attn_mask[multivew_hidden_states.shape[1]] + position_indices = None + if position_voxel_indices is not None: + if multivew_hidden_states.shape[1] in position_voxel_indices: + position_indices = position_voxel_indices[multivew_hidden_states.shape[1]] + attn_output = self.attn_multiview( + multivew_hidden_states, + encoder_hidden_states=multivew_hidden_states, + attention_mask=position_mask, + position_indices=position_indices, + **cross_attention_kwargs + ) + else: + attn_output = self.attn_multiview( + multivew_hidden_states, + encoder_hidden_states=multivew_hidden_states, + **cross_attention_kwargs + ) + + attn_output = rearrange(attn_output, 'b (n l) c -> (b n) l c', n=num_in_batch) + + hidden_states = mva_scale * attn_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + # 1.2 GLIGEN Control + if gligen_kwargs is not None: + hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"]) + + # 3. 
Cross-Attention + if self.attn2 is not None: + if self.norm_type == "ada_norm": + norm_hidden_states = self.norm2(hidden_states, timestep) + elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]: + norm_hidden_states = self.norm2(hidden_states) + elif self.norm_type == "ada_norm_single": + # For PixArt norm2 isn't applied here: + # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103 + norm_hidden_states = hidden_states + elif self.norm_type == "ada_norm_continuous": + norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"]) + else: + raise ValueError("Incorrect norm") + + if self.pos_embed is not None and self.norm_type != "ada_norm_single": + norm_hidden_states = self.pos_embed(norm_hidden_states) + + attn_output = self.attn2( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + **cross_attention_kwargs, + ) + + hidden_states = attn_output + hidden_states + + # 4. Feed-forward + # i2vgen doesn't have this norm 🤷‍♂️ + if self.norm_type == "ada_norm_continuous": + norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"]) + elif not self.norm_type == "ada_norm_single": + norm_hidden_states = self.norm3(hidden_states) + + if self.norm_type == "ada_norm_zero": + norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] + + if self.norm_type == "ada_norm_single": + norm_hidden_states = self.norm2(hidden_states) + norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp + + if self._chunk_size is not None: + # "feed_forward_chunk_size" can be used to save memory + ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size) + else: + ff_output = self.ff(norm_hidden_states) + + if self.norm_type == "ada_norm_zero": + ff_output = gate_mlp.unsqueeze(1) * ff_output + elif self.norm_type == "ada_norm_single": + ff_output = gate_mlp * ff_output + + hidden_states = ff_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + return hidden_states + +@torch.no_grad() +def compute_voxel_grid_mask(position, grid_resolution=8): + + position = position.half() + B,N,_,H,W = position.shape + assert H%grid_resolution==0 and W%grid_resolution==0 + + valid_mask = (position != 1).all(dim=2, keepdim=True) + valid_mask = valid_mask.expand_as(position) + position[valid_mask==False] = 0 + + + position = rearrange( + position, + 'b n c (num_h grid_h) (num_w grid_w) -> b n num_h num_w c grid_h grid_w', + num_h=grid_resolution, num_w=grid_resolution + ) + valid_mask = rearrange( + valid_mask, + 'b n c (num_h grid_h) (num_w grid_w) -> b n num_h num_w c grid_h grid_w', + num_h=grid_resolution, num_w=grid_resolution + ) + + grid_position = position.sum(dim=(-2, -1)) + count_masked = valid_mask.sum(dim=(-2, -1)) + + grid_position = grid_position / count_masked.clamp(min=1) + grid_position[count_masked<5] = 0 + + grid_position = grid_position.permute(0,1,4,2,3) + grid_position = rearrange(grid_position, 'b n c h w -> b n (h w) c') + + grid_position_expanded_1 = grid_position.unsqueeze(2).unsqueeze(4) # 形状变为 B, N, 1, L, 1, 3 + grid_position_expanded_2 = grid_position.unsqueeze(1).unsqueeze(3) # 形状变为 B, 1, N, 1, L, 3 + + # 计算欧氏距离 + distances = torch.norm(grid_position_expanded_1 - grid_position_expanded_2, dim=-1) # 形状为 B, N, N, L, L + + weights = distances + grid_distance = 
1.73/grid_resolution + + #weights = weights*-32 + #weights = weights.clamp(min=-10000.0) + + weights = weights< grid_distance + + return weights + +def compute_multi_resolution_mask(position_maps, grid_resolutions=[32, 16, 8]): + position_attn_mask = {} + with torch.no_grad(): + for grid_resolution in grid_resolutions: + position_mask = compute_voxel_grid_mask(position_maps, grid_resolution) + position_mask = rearrange(position_mask, 'b ni nj li lj -> b (ni li) (nj lj)') + position_attn_mask[position_mask.shape[1]] = position_mask + return position_attn_mask + +@torch.no_grad() +def compute_discrete_voxel_indice(position, grid_resolution=8, voxel_resolution=128): + + position = position.half() + B,N,_,H,W = position.shape + assert H%grid_resolution==0 and W%grid_resolution==0 + + valid_mask = (position != 1).all(dim=2, keepdim=True) + valid_mask = valid_mask.expand_as(position) + position[valid_mask==False] = 0 + + position = rearrange( + position, + 'b n c (num_h grid_h) (num_w grid_w) -> b n num_h num_w c grid_h grid_w', + num_h=grid_resolution, num_w=grid_resolution + ) + valid_mask = rearrange( + valid_mask, + 'b n c (num_h grid_h) (num_w grid_w) -> b n num_h num_w c grid_h grid_w', + num_h=grid_resolution, num_w=grid_resolution + ) + + grid_position = position.sum(dim=(-2, -1)) + count_masked = valid_mask.sum(dim=(-2, -1)) + + grid_position = grid_position / count_masked.clamp(min=1) + grid_position[count_masked<5] = 0 + + grid_position = grid_position.permute(0,1,4,2,3).clamp(0, 1) # B N C H W + voxel_indices = grid_position * (voxel_resolution - 1) + voxel_indices = torch.round(voxel_indices).long() + return voxel_indices + +def compute_multi_resolution_discrete_voxel_indice( + position_maps, + grid_resolutions=[64, 32, 16, 8], + voxel_resolutions=[512, 256, 128, 64] +): + voxel_indices = {} + with torch.no_grad(): + for grid_resolution, voxel_resolution in zip(grid_resolutions, voxel_resolutions): + voxel_indice = compute_discrete_voxel_indice(position_maps, grid_resolution, voxel_resolution) + voxel_indice = rearrange(voxel_indice, 'b n c h w -> b (n h w) c') + voxel_indices[voxel_indice.shape[1]] = {'voxel_indices':voxel_indice, 'voxel_resolution':voxel_resolution} + return voxel_indices + +class UNet2p5DConditionModel(torch.nn.Module): + def __init__(self, unet: UNet2DConditionModel) -> None: + super().__init__() + self.unet = unet + + self.use_ma = True + self.use_ra = True + self.use_camera_embedding = True + self.use_dual_stream = True + self.is_turbo = False + + if self.use_dual_stream: + self.unet_dual = copy.deepcopy(unet) + self.init_attention(self.unet_dual) + self.init_attention(self.unet, use_ma=self.use_ma, use_ra=self.use_ra, is_turbo=self.is_turbo) + self.init_condition() + self.init_camera_embedding() + + @staticmethod + def from_pretrained(pretrained_model_name_or_path, **kwargs): + torch_dtype = kwargs.pop('torch_dtype', torch.float32) + config_path = os.path.join(pretrained_model_name_or_path, 'config.json') + unet_ckpt_path = os.path.join(pretrained_model_name_or_path, 'diffusion_pytorch_model.bin') + with open(config_path, 'r', encoding='utf-8') as file: + config = json.load(file) + unet = UNet2DConditionModel(**config) + unet = UNet2p5DConditionModel(unet) + unet_ckpt = torch.load(unet_ckpt_path, map_location='cpu', weights_only=True) + unet.load_state_dict(unet_ckpt, strict=True) + unet = unet.to(torch_dtype) + return unet + + def init_condition(self): + self.unet.conv_in = torch.nn.Conv2d( + 12, + self.unet.conv_in.out_channels, + 
kernel_size=self.unet.conv_in.kernel_size, + stride=self.unet.conv_in.stride, + padding=self.unet.conv_in.padding, + dilation=self.unet.conv_in.dilation, + groups=self.unet.conv_in.groups, + bias=self.unet.conv_in.bias is not None) + + self.unet.learned_text_clip_gen = nn.Parameter(torch.randn(1, 77, 1024)) + self.unet.learned_text_clip_ref = nn.Parameter(torch.randn(1, 77, 1024)) + + def init_camera_embedding(self): + + if self.use_camera_embedding: + time_embed_dim = 1280 + self.max_num_ref_image = 5 + self.max_num_gen_image = 12 * 3 + 4 * 2 + self.unet.class_embedding = nn.Embedding(self.max_num_ref_image + self.max_num_gen_image, time_embed_dim) + + def init_attention(self, unet, use_ma=False, use_ra=False, is_turbo=False): + + for down_block_i, down_block in enumerate(unet.down_blocks): + if hasattr(down_block, "has_cross_attention") and down_block.has_cross_attention: + for attn_i, attn in enumerate(down_block.attentions): + for transformer_i, transformer in enumerate(attn.transformer_blocks): + if isinstance(transformer, BasicTransformerBlock): + attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock( + transformer, + f'down_{down_block_i}_{attn_i}_{transformer_i}', + use_ma, use_ra, is_turbo + ) + + if hasattr(unet.mid_block, "has_cross_attention") and unet.mid_block.has_cross_attention: + for attn_i, attn in enumerate(unet.mid_block.attentions): + for transformer_i, transformer in enumerate(attn.transformer_blocks): + if isinstance(transformer, BasicTransformerBlock): + attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock( + transformer, + f'mid_{attn_i}_{transformer_i}', + use_ma, use_ra, is_turbo + ) + + for up_block_i, up_block in enumerate(unet.up_blocks): + if hasattr(up_block, "has_cross_attention") and up_block.has_cross_attention: + for attn_i, attn in enumerate(up_block.attentions): + for transformer_i, transformer in enumerate(attn.transformer_blocks): + if isinstance(transformer, BasicTransformerBlock): + attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock( + transformer, + f'up_{up_block_i}_{attn_i}_{transformer_i}', + use_ma, use_ra, is_turbo + ) + + def __getattr__(self, name: str): + try: + return super().__getattr__(name) + except AttributeError: + return getattr(self.unet, name) + + def forward( + self, sample, timestep, encoder_hidden_states, + *args, down_intrablock_additional_residuals=None, + down_block_res_samples=None, mid_block_res_sample=None, + **cached_condition, + ): + B, N_gen, _, H, W = sample.shape + assert H == W + + if self.use_camera_embedding: + camera_info_gen = cached_condition['camera_info_gen'] + self.max_num_ref_image + camera_info_gen = rearrange(camera_info_gen, 'b n -> (b n)') + else: + camera_info_gen = None + + sample = [sample] + if 'normal_imgs' in cached_condition: + sample.append(cached_condition["normal_imgs"]) + if 'position_imgs' in cached_condition: + sample.append(cached_condition["position_imgs"]) + sample = torch.cat(sample, dim=2) + + sample = rearrange(sample, 'b n c h w -> (b n) c h w') + + encoder_hidden_states_gen = encoder_hidden_states.unsqueeze(1).repeat(1, N_gen, 1, 1) + encoder_hidden_states_gen = rearrange(encoder_hidden_states_gen, 'b n l c -> (b n) l c') + + if self.use_ra: + if 'condition_embed_dict' in cached_condition: + condition_embed_dict = cached_condition['condition_embed_dict'] + else: + condition_embed_dict = {} + ref_latents = cached_condition['ref_latents'] + N_ref = ref_latents.shape[1] + if self.use_camera_embedding: + camera_info_ref = 
cached_condition['camera_info_ref'] + camera_info_ref = rearrange(camera_info_ref, 'b n -> (b n)') + else: + camera_info_ref = None + + ref_latents = rearrange(ref_latents, 'b n c h w -> (b n) c h w') + + encoder_hidden_states_ref = self.unet.learned_text_clip_ref.unsqueeze(1).repeat(B, N_ref, 1, 1) + encoder_hidden_states_ref = rearrange(encoder_hidden_states_ref, 'b n l c -> (b n) l c') + + noisy_ref_latents = ref_latents + timestep_ref = 0 + + if self.use_dual_stream: + unet_ref = self.unet_dual + else: + unet_ref = self.unet + unet_ref( + noisy_ref_latents, timestep_ref, + encoder_hidden_states=encoder_hidden_states_ref, + class_labels=camera_info_ref, + # **kwargs + return_dict=False, + cross_attention_kwargs={ + 'mode': 'w', 'num_in_batch': N_ref, + 'condition_embed_dict': condition_embed_dict}, + ) + cached_condition['condition_embed_dict'] = condition_embed_dict + else: + condition_embed_dict = None + + mva_scale = cached_condition.get('mva_scale', 1.0) + ref_scale = cached_condition.get('ref_scale', 1.0) + + if self.is_turbo: + cross_attention_kwargs_ = { + 'mode': 'r', 'num_in_batch': N_gen, + 'condition_embed_dict': condition_embed_dict, + 'position_attn_mask':position_attn_mask, + 'position_voxel_indices':position_voxel_indices, + 'mva_scale': mva_scale, + 'ref_scale': ref_scale, + } + else: + cross_attention_kwargs_ = { + 'mode': 'r', 'num_in_batch': N_gen, + 'condition_embed_dict': condition_embed_dict, + 'mva_scale': mva_scale, + 'ref_scale': ref_scale, + } + return self.unet( + sample, timestep, + encoder_hidden_states_gen, *args, + class_labels=camera_info_gen, + down_intrablock_additional_residuals=[ + sample.to(dtype=self.unet.dtype) for sample in down_intrablock_additional_residuals + ] if down_intrablock_additional_residuals is not None else None, + down_block_additional_residuals=[ + sample.to(dtype=self.unet.dtype) for sample in down_block_res_samples + ] if down_block_res_samples is not None else None, + mid_block_additional_residual=( + mid_block_res_sample.to(dtype=self.unet.dtype) + if mid_block_res_sample is not None else None + ), + return_dict=False, + cross_attention_kwargs=cross_attention_kwargs_, + ) diff --git a/hy3dgen/texgen/pipelines.py b/hy3dgen/texgen/pipelines.py new file mode 100644 index 0000000..508a971 --- /dev/null +++ b/hy3dgen/texgen/pipelines.py @@ -0,0 +1,239 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
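+
+# Rough flow of Hunyuan3DPaintPipeline.__call__ defined below (a reading aid, not an
+# exhaustive description): the reference image is delighted (Light_Shadow_Remover),
+# the mesh is UV-unwrapped (mesh_uv_wrap), normal and position maps are rendered for
+# the six candidate cameras, the multiview diffusion model generates the RGB views,
+# and the views are baked into a UV texture which is then inpainted where no view
+# covers it.
+#
+# A minimal usage sketch, assuming the weights live under 'tencent/Hunyuan3D-2':
+#
+#   pipe = Hunyuan3DPaintPipeline.from_pretrained('tencent/Hunyuan3D-2')
+#   textured_mesh = pipe(mesh, image='assets/demo.png')
+#   textured_mesh.export('textured.glb')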
+ + +import logging +import numpy as np +import os +import torch +from PIL import Image +from typing import List, Union, Optional + + +from .differentiable_renderer.mesh_render import MeshRender +from .utils.dehighlight_utils import Light_Shadow_Remover +from .utils.multiview_utils import Multiview_Diffusion_Net +from .utils.imagesuper_utils import Image_Super_Net +from .utils.uv_warp_utils import mesh_uv_wrap + +logger = logging.getLogger(__name__) + + +class Hunyuan3DTexGenConfig: + + def __init__(self, light_remover_ckpt_path, multiview_ckpt_path, subfolder_name): + self.device = 'cuda' + self.light_remover_ckpt_path = light_remover_ckpt_path + self.multiview_ckpt_path = multiview_ckpt_path + + self.candidate_camera_azims = [0, 90, 180, 270, 0, 180] + self.candidate_camera_elevs = [0, 0, 0, 0, 90, -90] + self.candidate_view_weights = [1, 0.1, 0.5, 0.1, 0.05, 0.05] + + self.render_size = 2048 + self.texture_size = 2048 + self.bake_exp = 4 + self.merge_method = 'fast' + + self.pipe_dict = {'hunyuan3d-paint-v2-0': 'hunyuanpaint', 'hunyuan3d-paint-v2-0-turbo': 'hunyuanpaint-turbo'} + self.pipe_name = self.pipe_dict[subfolder_name] + + +class Hunyuan3DPaintPipeline: + @classmethod + def from_pretrained(cls, model_path, subfolder='hunyuan3d-paint-v2-0-turbo'): + original_model_path = model_path + if not os.path.exists(model_path): + # try local path + base_dir = os.environ.get('HY3DGEN_MODELS', '~/.cache/hy3dgen') + model_path = os.path.expanduser(os.path.join(base_dir, model_path)) + + delight_model_path = os.path.join(model_path, 'hunyuan3d-delight-v2-0') + multiview_model_path = os.path.join(model_path, subfolder) + + if not os.path.exists(delight_model_path) or not os.path.exists(multiview_model_path): + try: + import huggingface_hub + # download from huggingface + model_path = huggingface_hub.snapshot_download( + repo_id=original_model_path, allow_patterns=["hunyuan3d-delight-v2-0/*"] + ) + model_path = huggingface_hub.snapshot_download( + repo_id=original_model_path, allow_patterns=[f'{subfolder}/*'] + ) + delight_model_path = os.path.join(model_path, 'hunyuan3d-delight-v2-0') + multiview_model_path = os.path.join(model_path, subfolder) + return cls(Hunyuan3DTexGenConfig(delight_model_path, multiview_model_path, subfolder)) + except ImportError: + logger.warning( + "You need to install HuggingFace Hub to load models from the hub." 
+ ) + raise RuntimeError(f"Model path {model_path} not found") + else: + return cls(Hunyuan3DTexGenConfig(delight_model_path, multiview_model_path, subfolder)) + + raise FileNotFoundError(f"Model path {original_model_path} not found and we could not find it at huggingface") + + def __init__(self, config): + self.config = config + self.models = {} + self.render = MeshRender( + default_resolution=self.config.render_size, + texture_size=self.config.texture_size) + + self.load_models() + + def load_models(self): + # empty cude cache + torch.cuda.empty_cache() + # Load model + self.models['delight_model'] = Light_Shadow_Remover(self.config) + self.models['multiview_model'] = Multiview_Diffusion_Net(self.config) + # self.models['super_model'] = Image_Super_Net(self.config) + + def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"): + self.models['delight_model'].pipeline.enable_model_cpu_offload(gpu_id=gpu_id, device=device) + self.models['multiview_model'].pipeline.enable_model_cpu_offload(gpu_id=gpu_id, device=device) + + def render_normal_multiview(self, camera_elevs, camera_azims, use_abs_coor=True): + normal_maps = [] + for elev, azim in zip(camera_elevs, camera_azims): + normal_map = self.render.render_normal( + elev, azim, use_abs_coor=use_abs_coor, return_type='pl') + normal_maps.append(normal_map) + + return normal_maps + + def render_position_multiview(self, camera_elevs, camera_azims): + position_maps = [] + for elev, azim in zip(camera_elevs, camera_azims): + position_map = self.render.render_position( + elev, azim, return_type='pl') + position_maps.append(position_map) + + return position_maps + + def bake_from_multiview(self, views, camera_elevs, + camera_azims, view_weights, method='graphcut'): + project_textures, project_weighted_cos_maps = [], [] + project_boundary_maps = [] + for view, camera_elev, camera_azim, weight in zip( + views, camera_elevs, camera_azims, view_weights): + project_texture, project_cos_map, project_boundary_map = self.render.back_project( + view, camera_elev, camera_azim) + project_cos_map = weight * (project_cos_map ** self.config.bake_exp) + project_textures.append(project_texture) + project_weighted_cos_maps.append(project_cos_map) + project_boundary_maps.append(project_boundary_map) + + if method == 'fast': + texture, ori_trust_map = self.render.fast_bake_texture( + project_textures, project_weighted_cos_maps) + else: + raise f'no method {method}' + return texture, ori_trust_map > 1E-8 + + def texture_inpaint(self, texture, mask): + + texture_np = self.render.uv_inpaint(texture, mask) + texture = torch.tensor(texture_np / 255).float().to(texture.device) + + return texture + + def recenter_image(self, image, border_ratio=0.2): + if image.mode == 'RGB': + return image + elif image.mode == 'L': + image = image.convert('RGB') + return image + + alpha_channel = np.array(image)[:, :, 3] + non_zero_indices = np.argwhere(alpha_channel > 0) + if non_zero_indices.size == 0: + raise ValueError("Image is fully transparent") + + min_row, min_col = non_zero_indices.min(axis=0) + max_row, max_col = non_zero_indices.max(axis=0) + + cropped_image = image.crop((min_col, min_row, max_col + 1, max_row + 1)) + + width, height = cropped_image.size + border_width = int(width * border_ratio) + border_height = int(height * border_ratio) + + new_width = width + 2 * border_width + new_height = height + 2 * border_height + + square_size = max(new_width, new_height) + + new_image = Image.new('RGBA', (square_size, square_size), 
(255, 255, 255, 0)) + + paste_x = (square_size - new_width) // 2 + border_width + paste_y = (square_size - new_height) // 2 + border_height + + new_image.paste(cropped_image, (paste_x, paste_y)) + return new_image + + @torch.no_grad() + def __call__(self, mesh, image): + + if not isinstance(image, List): + image = [image] + + images_prompt = [] + for i in range(len(image)): + if isinstance(image[i], str): + image_prompt = Image.open(image[i]) + else: + image_prompt = image[i] + images_prompt.append(image_prompt) + + images_prompt = [self.recenter_image(image_prompt) for image_prompt in images_prompt] + + images_prompt = [self.models['delight_model'](image_prompt) for image_prompt in images_prompt] + + mesh = mesh_uv_wrap(mesh) + + self.render.load_mesh(mesh) + + selected_camera_elevs, selected_camera_azims, selected_view_weights = \ + self.config.candidate_camera_elevs, self.config.candidate_camera_azims, self.config.candidate_view_weights + + normal_maps = self.render_normal_multiview( + selected_camera_elevs, selected_camera_azims, use_abs_coor=True) + position_maps = self.render_position_multiview( + selected_camera_elevs, selected_camera_azims) + + camera_info = [(((azim // 30) + 9) % 12) // {-20: 1, 0: 1, 20: 1, -90: 3, 90: 3}[ + elev] + {-20: 0, 0: 12, 20: 24, -90: 36, 90: 40}[elev] for azim, elev in + zip(selected_camera_azims, selected_camera_elevs)] + multiviews = self.models['multiview_model'](images_prompt, normal_maps + position_maps, camera_info) + + for i in range(len(multiviews)): + # multiviews[i] = self.models['super_model'](multiviews[i]) + multiviews[i] = multiviews[i].resize( + (self.config.render_size, self.config.render_size)) + + texture, mask = self.bake_from_multiview(multiviews, + selected_camera_elevs, selected_camera_azims, selected_view_weights, + method=self.config.merge_method) + + mask_np = (mask.squeeze(-1).cpu().numpy() * 255).astype(np.uint8) + + texture = self.texture_inpaint(texture, mask_np) + + self.render.set_texture(texture) + textured_mesh = self.render.save_mesh() + + return textured_mesh diff --git a/hy3dgen/texgen/utils/__init__.py b/hy3dgen/texgen/utils/__init__.py new file mode 100644 index 0000000..8bb2bf8 --- /dev/null +++ b/hy3dgen/texgen/utils/__init__.py @@ -0,0 +1,13 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
\ No newline at end of file diff --git a/hy3dgen/texgen/utils/alignImg4Tex_utils.py b/hy3dgen/texgen/utils/alignImg4Tex_utils.py new file mode 100644 index 0000000..34df204 --- /dev/null +++ b/hy3dgen/texgen/utils/alignImg4Tex_utils.py @@ -0,0 +1,121 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import torch +from diffusers import EulerAncestralDiscreteScheduler +from diffusers import StableDiffusionControlNetPipeline, StableDiffusionXLControlNetImg2ImgPipeline, ControlNetModel, \ + AutoencoderKL + + +class Img2img_Control_Ip_adapter: + def __init__(self, device): + controlnet = ControlNetModel.from_pretrained('lllyasviel/control_v11f1p_sd15_depth', torch_dtype=torch.float16, + variant="fp16", use_safetensors=True) + pipe = StableDiffusionControlNetPipeline.from_pretrained( + 'runwayml/stable-diffusion-v1-5', controlnet=controlnet, torch_dtype=torch.float16, use_safetensors=True + ) + pipe.load_ip_adapter('h94/IP-Adapter', subfolder="models", weight_name="ip-adapter-plus_sd15.safetensors") + pipe.set_ip_adapter_scale(0.7) + + pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) + # pipe.enable_model_cpu_offload() + self.pipe = pipe.to(device) + + def __call__( + self, + prompt, + control_image, + ip_adapter_image, + negative_prompt, + height=512, + width=512, + num_inference_steps=20, + guidance_scale=8.0, + controlnet_conditioning_scale=1.0, + output_type="pil", + **kwargs, + ): + results = self.pipe( + prompt=prompt, + negative_prompt=negative_prompt, + image=control_image, + ip_adapter_image=ip_adapter_image, + generator=torch.manual_seed(42), + seed=42, + num_inference_steps=num_inference_steps, + guidance_scale=guidance_scale, + controlnet_conditioning_scale=controlnet_conditioning_scale, + strength=1, + # clip_skip=2, + height=height, + width=width, + output_type=output_type, + **kwargs, + ).images[0] + return results + + +################################################################ + +class HesModel: + def __init__(self, ): + controlnet_depth = ControlNetModel.from_pretrained( + 'diffusers/controlnet-depth-sdxl-1.0', + torch_dtype=torch.float16, + variant="fp16", + use_safetensors=True + ) + self.pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained( + 'stabilityai/stable-diffusion-xl-base-1.0', + torch_dtype=torch.float16, + variant="fp16", + controlnet=controlnet_depth, + use_safetensors=True, + ) + self.pipe.vae = AutoencoderKL.from_pretrained( + 'madebyollin/sdxl-vae-fp16-fix', + torch_dtype=torch.float16 + ) + + self.pipe.load_ip_adapter('h94/IP-Adapter', subfolder="sdxl_models", weight_name="ip-adapter_sdxl.safetensors") + 
self.pipe.set_ip_adapter_scale(0.7) + self.pipe.to("cuda") + + def __call__(self, + init_image, + control_image, + ip_adapter_image=None, + prompt='3D image', + negative_prompt='2D image', + seed=42, + strength=0.8, + num_inference_steps=40, + guidance_scale=7.5, + controlnet_conditioning_scale=0.5, + **kwargs + ): + image = self.pipe( + prompt=prompt, + image=init_image, + control_image=control_image, + ip_adapter_image=ip_adapter_image, + negative_prompt=negative_prompt, + num_inference_steps=num_inference_steps, + guidance_scale=guidance_scale, + strength=strength, + controlnet_conditioning_scale=controlnet_conditioning_scale, + seed=seed, + **kwargs + ).images[0] + return image diff --git a/hy3dgen/texgen/utils/counter_utils.py b/hy3dgen/texgen/utils/counter_utils.py new file mode 100644 index 0000000..383a515 --- /dev/null +++ b/hy3dgen/texgen/utils/counter_utils.py @@ -0,0 +1,48 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + + +class RunningStats(): + def __init__(self) -> None: + self.count = 0 + self.sum = 0 + self.mean = 0 + self.min = None + self.max = None + + def add_value(self, value): + self.count += 1 + self.sum += value + self.mean = self.sum / self.count + + if self.min is None or value < self.min: + self.min = value + + if self.max is None or value > self.max: + self.max = value + + def get_count(self): + return self.count + + def get_sum(self): + return self.sum + + def get_mean(self): + return self.mean + + def get_min(self): + return self.min + + def get_max(self): + return self.max diff --git a/hy3dgen/texgen/utils/dehighlight_utils.py b/hy3dgen/texgen/utils/dehighlight_utils.py new file mode 100644 index 0000000..9b52368 --- /dev/null +++ b/hy3dgen/texgen/utils/dehighlight_utils.py @@ -0,0 +1,110 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. 
+ +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import cv2 +import numpy as np +import torch +from PIL import Image +from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler + + +class Light_Shadow_Remover(): + def __init__(self, config): + self.device = config.device + self.cfg_image = 1.5 + self.cfg_text = 1.0 + + pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained( + config.light_remover_ckpt_path, + torch_dtype=torch.float16, + safety_checker=None, + ) + pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config) + pipeline.set_progress_bar_config(disable=True) + + self.pipeline = pipeline.to(self.device, torch.float16) + + def recorrect_rgb(self, src_image, target_image, alpha_channel, scale=0.95): + + def flat_and_mask(bgr, a): + mask = torch.where(a > 0.5, True, False) + bgr_flat = bgr.reshape(-1, bgr.shape[-1]) + mask_flat = mask.reshape(-1) + bgr_flat_masked = bgr_flat[mask_flat, :] + return bgr_flat_masked + + src_flat = flat_and_mask(src_image, alpha_channel) + target_flat = flat_and_mask(target_image, alpha_channel) + corrected_bgr = torch.zeros_like(src_image) + + for i in range(3): + src_mean, src_stddev = torch.mean(src_flat[:, i]), torch.std(src_flat[:, i]) + target_mean, target_stddev = torch.mean(target_flat[:, i]), torch.std(target_flat[:, i]) + corrected_bgr[:, :, i] = torch.clamp( + (src_image[:, :, i] - scale * src_mean) * + (target_stddev / src_stddev) + scale * target_mean, + 0, 1) + + src_mse = torch.mean((src_image - target_image) ** 2) + modify_mse = torch.mean((corrected_bgr - target_image) ** 2) + if src_mse < modify_mse: + corrected_bgr = torch.cat([src_image, alpha_channel], dim=-1) + else: + corrected_bgr = torch.cat([corrected_bgr, alpha_channel], dim=-1) + + return corrected_bgr + + @torch.no_grad() + def __call__(self, image): + + image = image.resize((512, 512)) + + if image.mode == 'RGBA': + image_array = np.array(image) + alpha_channel = image_array[:, :, 3] + erosion_size = 3 + kernel = np.ones((erosion_size, erosion_size), np.uint8) + alpha_channel = cv2.erode(alpha_channel, kernel, iterations=1) + image_array[alpha_channel == 0, :3] = 255 + image_array[:, :, 3] = alpha_channel + image = Image.fromarray(image_array) + + image_tensor = torch.tensor(np.array(image) / 255.0).to(self.device) + alpha = image_tensor[:, :, 3:] + rgb_target = image_tensor[:, :, :3] + else: + image_tensor = torch.tensor(np.array(image) / 255.0).to(self.device) + alpha = torch.ones_like(image_tensor)[:, :, :1] + rgb_target = image_tensor[:, :, :3] + + image = image.convert('RGB') + + image = self.pipeline( + prompt="", + image=image, + generator=torch.manual_seed(42), + height=512, + width=512, + num_inference_steps=50, + image_guidance_scale=self.cfg_image, + guidance_scale=self.cfg_text, + ).images[0] + + image_tensor = torch.tensor(np.array(image)/255.0).to(self.device) + rgb_src = image_tensor[:,:,:3] + image = self.recorrect_rgb(rgb_src, rgb_target, alpha) + image = image[:,:,:3]*image[:,:,3:] + torch.ones_like(image[:,:,:3])*(1.0-image[:,:,3:]) + image = 
Image.fromarray((image.cpu().numpy()*255).astype(np.uint8)) + + return image diff --git a/hy3dgen/texgen/utils/imagesuper_utils.py b/hy3dgen/texgen/utils/imagesuper_utils.py new file mode 100644 index 0000000..0b893c5 --- /dev/null +++ b/hy3dgen/texgen/utils/imagesuper_utils.py @@ -0,0 +1,34 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import torch +from diffusers import StableDiffusionUpscalePipeline + +class Image_Super_Net(): + def __init__(self, config): + self.up_pipeline_x4 = StableDiffusionUpscalePipeline.from_pretrained( + 'stabilityai/stable-diffusion-x4-upscaler', + torch_dtype=torch.float16, + ).to(config.device) + self.up_pipeline_x4.set_progress_bar_config(disable=True) + + def __call__(self, image, prompt=''): + with torch.no_grad(): + upscaled_image = self.up_pipeline_x4( + prompt=[prompt], + image=image, + num_inference_steps=5, + ).images[0] + + return upscaled_image diff --git a/hy3dgen/texgen/utils/multiview_utils.py b/hy3dgen/texgen/utils/multiview_utils.py new file mode 100644 index 0000000..4d6a6ba --- /dev/null +++ b/hy3dgen/texgen/utils/multiview_utils.py @@ -0,0 +1,87 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
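+
+# Note on the call convention used by Hunyuan3DPaintPipeline (texgen/pipelines.py):
+# the wrapper below receives the reference image(s), then a single list of control
+# images whose first half are normal maps and whose second half are position maps
+# (it is invoked roughly as
+#   multiviews = multiview_model(images_prompt, normal_maps + position_maps, camera_info)
+# ), plus one camera index per generated view. All images are resized to the 512x512
+# view size before being handed to the custom 'hunyuanpaint' pipeline.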
+ +import os +import random + +import numpy as np +import torch +from typing import List +from diffusers import DiffusionPipeline +from diffusers import EulerAncestralDiscreteScheduler, LCMScheduler + + +class Multiview_Diffusion_Net(): + def __init__(self, config) -> None: + self.device = config.device + self.view_size = 512 + multiview_ckpt_path = config.multiview_ckpt_path + + current_file_path = os.path.abspath(__file__) + custom_pipeline_path = os.path.join(os.path.dirname(current_file_path), '..', 'hunyuanpaint') + + pipeline = DiffusionPipeline.from_pretrained( + multiview_ckpt_path, + custom_pipeline=custom_pipeline_path, torch_dtype=torch.float16) + + if config.pipe_name in ['hunyuanpaint']: + pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config, + timestep_spacing='trailing') + elif config.pipe_name in ['hunyuanpaint-turbo']: + pipeline.scheduler = LCMScheduler.from_config(pipeline.scheduler.config, + timestep_spacing='trailing') + pipeline.set_turbo(True) + # pipeline.prepare() + + pipeline.set_progress_bar_config(disable=True) + self.pipeline = pipeline.to(self.device) + + def seed_everything(self, seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + os.environ["PL_GLOBAL_SEED"] = str(seed) + + def __call__(self, input_images, control_images, camera_info): + + self.seed_everything(0) + + if not isinstance(input_images, List): + input_images = [input_images] + + input_images = [input_image.resize((self.view_size, self.view_size)) for input_image in input_images] + for i in range(len(control_images)): + control_images[i] = control_images[i].resize((self.view_size, self.view_size)) + if control_images[i].mode == 'L': + control_images[i] = control_images[i].point(lambda x: 255 if x > 1 else 0, mode='1') + + kwargs = dict(generator=torch.Generator(device=self.pipeline.device).manual_seed(0)) + + num_view = len(control_images) // 2 + normal_image = [[control_images[i] for i in range(num_view)]] + position_image = [[control_images[i + num_view] for i in range(num_view)]] + + camera_info_gen = [camera_info] + camera_info_ref = [[0]] + kwargs['width'] = self.view_size + kwargs['height'] = self.view_size + kwargs['num_in_batch'] = num_view + kwargs['camera_info_gen'] = camera_info_gen + kwargs['camera_info_ref'] = camera_info_ref + kwargs["normal_imgs"] = normal_image + kwargs["position_imgs"] = position_image + + mvd_image = self.pipeline(input_images, num_inference_steps=30, **kwargs).images + + return mvd_image diff --git a/hy3dgen/texgen/utils/simplify_mesh_utils.py b/hy3dgen/texgen/utils/simplify_mesh_utils.py new file mode 100644 index 0000000..5c23999 --- /dev/null +++ b/hy3dgen/texgen/utils/simplify_mesh_utils.py @@ -0,0 +1,36 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. 
+
+# For avoidance of doubts, Hunyuan 3D means the large language models and
+# their software and algorithms, including trained model weights, parameters (including
+# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
+# fine-tuning enabling code and other elements of the foregoing made publicly available
+# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
+
+import trimesh
+
+
+def remesh_mesh(mesh_path, remesh_path, method='trimesh'):
+    if method == 'trimesh':
+        mesh_simplify_trimesh(mesh_path, remesh_path)
+    else:
+        raise NotImplementedError(f'Method {method} has not been implemented.')
+
+
+def mesh_simplify_trimesh(inputpath, outputpath):
+    import pymeshlab
+    ms = pymeshlab.MeshSet()
+    ms.load_new_mesh(inputpath, load_in_a_single_layer=True)
+    ms.save_current_mesh(outputpath.replace('.glb', '.obj'), save_textures=False)
+
+    mesh = trimesh.load(outputpath.replace('.glb', '.obj'), force='mesh')
+    face_num = mesh.faces.shape[0]
+
+    if face_num > 100000:
+        mesh = mesh.simplify_quadric_decimation(40000)
+    mesh.export(outputpath)
diff --git a/hy3dgen/texgen/utils/uv_warp_utils.py b/hy3dgen/texgen/utils/uv_warp_utils.py
new file mode 100644
index 0000000..f55a924
--- /dev/null
+++ b/hy3dgen/texgen/utils/uv_warp_utils.py
@@ -0,0 +1,32 @@
+# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
+# except for the third-party components listed below.
+# Hunyuan 3D does not impose any additional limitations beyond what is outlined
+# in the repsective licenses of these third-party components.
+# Users must comply with all terms and conditions of original licenses of these third-party
+# components and must ensure that the usage of the third party components adheres to
+# all relevant laws and regulations.
+
+# For avoidance of doubts, Hunyuan 3D means the large language models and
+# their software and algorithms, including trained model weights, parameters (including
+# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
+# fine-tuning enabling code and other elements of the foregoing made publicly available
+# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
+
+import trimesh
+import xatlas
+
+
+def mesh_uv_wrap(mesh):
+    if isinstance(mesh, trimesh.Scene):
+        mesh = mesh.dump(concatenate=True)
+
+    if len(mesh.faces) > 500000000:
+        raise ValueError("The mesh has more than 500,000,000 faces, which is not supported.")
+
+    vmapping, indices, uvs = xatlas.parametrize(mesh.vertices, mesh.faces)
+
+    mesh.vertices = mesh.vertices[vmapping]
+    mesh.faces = indices
+    mesh.visual.uv = uvs
+
+    return mesh
diff --git a/hy3dgen/text2image.py b/hy3dgen/text2image.py
new file mode 100644
index 0000000..2c8a3ab
--- /dev/null
+++ b/hy3dgen/text2image.py
@@ -0,0 +1,81 @@
+# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
+# except for the third-party components listed below.
+# Hunyuan 3D does not impose any additional limitations beyond what is outlined
+# in the repsective licenses of these third-party components.
+# Users must comply with all terms and conditions of original licenses of these third-party
+# components and must ensure that the usage of the third party components adheres to
+# all relevant laws and regulations.
+ +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import os +import random + +import numpy as np +import torch +from diffusers import AutoPipelineForText2Image + + +def seed_everything(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + os.environ["PL_GLOBAL_SEED"] = str(seed) + + +class HunyuanDiTPipeline: + def __init__( + self, + model_path="Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled", + device='cuda' + ): + self.device = device + self.pipe = AutoPipelineForText2Image.from_pretrained( + model_path, + torch_dtype=torch.float16, + enable_pag=True, + pag_applied_layers=["blocks.(16|17|18|19)"] + ).to(device) + self.pos_txt = ",白色背景,3D风格,最佳质量" + self.neg_txt = "文本,特写,裁剪,出框,最差质量,低质量,JPEG伪影,PGLY,重复,病态," \ + "残缺,多余的手指,变异的手,画得不好的手,画得不好的脸,变异,畸形,模糊,脱水,糟糕的解剖学," \ + "糟糕的比例,多余的肢体,克隆的脸,毁容,恶心的比例,畸形的肢体,缺失的手臂,缺失的腿," \ + "额外的手臂,额外的腿,融合的手指,手指太多,长脖子" + + def compile(self): + # accelarate hunyuan-dit transformer,first inference will cost long time + torch.set_float32_matmul_precision('high') + self.pipe.transformer = torch.compile(self.pipe.transformer, fullgraph=True) + # self.pipe.vae.decode = torch.compile(self.pipe.vae.decode, fullgraph=True) + generator = torch.Generator(device=self.pipe.device) # infer once for hot-start + out_img = self.pipe( + prompt='美少女战士', + negative_prompt='模糊', + num_inference_steps=25, + pag_scale=1.3, + width=1024, + height=1024, + generator=generator, + return_dict=False + )[0][0] + + @torch.no_grad() + def __call__(self, prompt, seed=0): + seed_everything(seed) + generator = torch.Generator(device=self.pipe.device) + generator = generator.manual_seed(int(seed)) + out_img = self.pipe( + prompt=prompt[:60] + self.pos_txt, + negative_prompt=self.neg_txt, + num_inference_steps=25, + pag_scale=1.3, + width=1024, + height=1024, + generator=generator, + return_dict=False + )[0][0] + return out_img diff --git a/minimal_demo.py b/minimal_demo.py new file mode 100644 index 0000000..c268422 --- /dev/null +++ b/minimal_demo.py @@ -0,0 +1,33 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
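+
+# The demo goes image -> shape -> texture. If you only have a text prompt, an input
+# image can be generated first with the text-to-image helper in hy3dgen/text2image.py,
+# roughly like this (a sketch; the HunyuanDiT weights are fetched on first use):
+#
+#   from hy3dgen.text2image import HunyuanDiTPipeline
+#   t2i = HunyuanDiTPipeline()
+#   image = t2i('a lovely rabbit eating carrots')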
+ +from PIL import Image + +from hy3dgen.rembg import BackgroundRemover +from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline +from hy3dgen.texgen import Hunyuan3DPaintPipeline + +model_path = 'tencent/Hunyuan3D-2' +pipeline_shapegen = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(model_path) +pipeline_texgen = Hunyuan3DPaintPipeline.from_pretrained(model_path) + +image_path = 'assets/demo.png' +image = Image.open(image_path).convert("RGBA") +if image.mode == 'RGB': + rembg = BackgroundRemover() + image = rembg(image) + +mesh = pipeline_shapegen(image=image)[0] +mesh = pipeline_texgen(mesh, image=image) +mesh.export('demo.glb') diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..abdab84 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,40 @@ +ninja +pybind11 + +diffusers +einops +opencv-python +numpy +torch +transformers +torchvision +#taming-transformers-rom1504 +#ConfigArgParse +#ipdb +omegaconf + +#sentencepiece +tqdm + +# Mesh Processing +trimesh +pymeshlab +pygltflib +xatlas +#kornia +#facexlib + +# Training +accelerate +#pytorch_lightning +#scikit-learn +#scikit-image + +# Demo only +gradio +fastapi +uvicorn +rembg +onnxruntime +#gevent +#geventhttpclient \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..5339da1 --- /dev/null +++ b/setup.py @@ -0,0 +1,46 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +from setuptools import setup, find_packages + +setup( + name="hy3dgen", + version="2.0.2", + url="https://github.com/Tencent/Hunyuan3D-2", + packages=find_packages(), + include_package_data=True, + package_data={"hy3dgen": ["assets/*", "assets/**/*"]}, + install_requires=[ + 'gradio', + "tqdm>=4.66.3", + 'numpy', + 'ninja', + 'diffusers', + 'pybind11', + 'opencv-python', + 'einops', + "transformers>=4.48.0", + 'omegaconf', + 'trimesh', + 'pymeshlab', + 'pygltflib', + 'xatlas', + 'accelerate', + 'gradio', + 'fastapi', + 'uvicorn', + 'rembg', + 'onnxruntime' + ] +)
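+
+# For local development, the package can be installed in editable mode from the
+# repository root:
+#   pip install -e .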