commit 53131e903f43fb0522aac89ea649898e9ce61919 Author: Rasul Date: Tue May 6 15:25:37 2025 +0300 init diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d83c492 --- /dev/null +++ b/.gitignore @@ -0,0 +1,172 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +!hy3dgen/texgen/custom_rasterizer/lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +#uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ +.DS_Store +# Cython debug symbols +cython_debug/ +gradio_cache/ +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea +#Docs +*.md +*.pdf diff --git a/api_server.py b/api_server.py new file mode 100644 index 0000000..dd2895b --- /dev/null +++ b/api_server.py @@ -0,0 +1,316 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +""" +A model worker executes the model. +""" +import argparse +import asyncio +import base64 +import logging +import logging.handlers +import os +import sys +import tempfile +import threading +import traceback +import uuid +from io import BytesIO + +import torch +import trimesh +import uvicorn +from PIL import Image +from fastapi import FastAPI, Request +from fastapi.responses import JSONResponse, FileResponse + +from hy3dgen.rembg import BackgroundRemover +from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline, FloaterRemover, DegenerateFaceRemover, FaceReducer, \ + MeshSimplifier +from hy3dgen.texgen import Hunyuan3DPaintPipeline +from hy3dgen.text2image import HunyuanDiTPipeline + +LOGDIR = '.' + +server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" +moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN." 
+ +handler = None + + +def build_logger(logger_name, logger_filename): + global handler + + formatter = logging.Formatter( + fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + + # Set the format of root handlers + if not logging.getLogger().handlers: + logging.basicConfig(level=logging.INFO) + logging.getLogger().handlers[0].setFormatter(formatter) + + # Redirect stdout and stderr to loggers + stdout_logger = logging.getLogger("stdout") + stdout_logger.setLevel(logging.INFO) + sl = StreamToLogger(stdout_logger, logging.INFO) + sys.stdout = sl + + stderr_logger = logging.getLogger("stderr") + stderr_logger.setLevel(logging.ERROR) + sl = StreamToLogger(stderr_logger, logging.ERROR) + sys.stderr = sl + + # Get logger + logger = logging.getLogger(logger_name) + logger.setLevel(logging.INFO) + + # Add a file handler for all loggers + if handler is None: + os.makedirs(LOGDIR, exist_ok=True) + filename = os.path.join(LOGDIR, logger_filename) + handler = logging.handlers.TimedRotatingFileHandler( + filename, when='D', utc=True, encoding='UTF-8') + handler.setFormatter(formatter) + + for name, item in logging.root.manager.loggerDict.items(): + if isinstance(item, logging.Logger): + item.addHandler(handler) + + return logger + + +class StreamToLogger(object): + """ + Fake file-like stream object that redirects writes to a logger instance. + """ + + def __init__(self, logger, log_level=logging.INFO): + self.terminal = sys.stdout + self.logger = logger + self.log_level = log_level + self.linebuf = '' + + def __getattr__(self, attr): + return getattr(self.terminal, attr) + + def write(self, buf): + temp_linebuf = self.linebuf + buf + self.linebuf = '' + for line in temp_linebuf.splitlines(True): + # From the io.TextIOWrapper docs: + # On output, if newline is None, any '\n' characters written + # are translated to the system default line separator. + # By default sys.stdout.write() expects '\n' newlines and then + # translates them so this is still cross platform. 
+ if line[-1] == '\n': + self.logger.log(self.log_level, line.rstrip()) + else: + self.linebuf += line + + def flush(self): + if self.linebuf != '': + self.logger.log(self.log_level, self.linebuf.rstrip()) + self.linebuf = '' + + +def pretty_print_semaphore(semaphore): + if semaphore is None: + return "None" + return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" + + +SAVE_DIR = 'gradio_cache' +os.makedirs(SAVE_DIR, exist_ok=True) + +worker_id = str(uuid.uuid4())[:6] +logger = build_logger("controller", f"{SAVE_DIR}/controller.log") + + +def load_image_from_base64(image): + return Image.open(BytesIO(base64.b64decode(image))) + + +class ModelWorker: + def __init__(self, + model_path='tencent/Hunyuan3D-2mini', + tex_model_path='tencent/Hunyuan3D-2', + subfolder='hunyuan3d-dit-v2-mini-turbo', + device='cuda', + enable_tex=False): + self.model_path = model_path + self.worker_id = worker_id + self.device = device + logger.info(f"Loading the model {model_path} on worker {worker_id} ...") + + self.rembg = BackgroundRemover() + self.pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained( + model_path, + subfolder=subfolder, + use_safetensors=True, + device=device, + ) + self.pipeline.enable_flashvdm(mc_algo='mc') + # self.pipeline_t2i = HunyuanDiTPipeline( + # 'Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled', + # device=device + # ) + if enable_tex: + self.pipeline_tex = Hunyuan3DPaintPipeline.from_pretrained(tex_model_path) + + def get_queue_length(self): + if model_semaphore is None: + return 0 + else: + return args.limit_model_concurrency - model_semaphore._value + (len( + model_semaphore._waiters) if model_semaphore._waiters is not None else 0) + + def get_status(self): + return { + "speed": 1, + "queue_length": self.get_queue_length(), + } + + @torch.inference_mode() + def generate(self, uid, params): + if 'image' in params: + image = params["image"] + image = load_image_from_base64(image) + else: + if 'text' in params: + text = params["text"] + image = self.pipeline_t2i(text) + else: + raise ValueError("No input image or text provided") + + image = self.rembg(image) + params['image'] = image + + if 'mesh' in params: + mesh = trimesh.load(BytesIO(base64.b64decode(params["mesh"])), file_type='glb') + else: + seed = params.get("seed", 1234) + params['generator'] = torch.Generator(self.device).manual_seed(seed) + params['octree_resolution'] = params.get("octree_resolution", 128) + params['num_inference_steps'] = params.get("num_inference_steps", 5) + params['guidance_scale'] = params.get('guidance_scale', 5.0) + params['mc_algo'] = 'mc' + import time + start_time = time.time() + mesh = self.pipeline(**params)[0] + logger.info("--- %s seconds ---" % (time.time() - start_time)) + + if params.get('texture', False): + mesh = FloaterRemover()(mesh) + mesh = DegenerateFaceRemover()(mesh) + mesh = FaceReducer()(mesh, max_facenum=params.get('face_count', 40000)) + mesh = self.pipeline_tex(mesh, image) + + type = params.get('type', 'glb') + with tempfile.NamedTemporaryFile(suffix=f'.{type}', delete=False) as temp_file: + mesh.export(temp_file.name) + mesh = trimesh.load(temp_file.name) + save_path = os.path.join(SAVE_DIR, f'{str(uid)}.{type}') + mesh.export(save_path) + + torch.cuda.empty_cache() + return save_path, uid + + +app = FastAPI() +from fastapi.middleware.cors import CORSMiddleware + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], # 你可以指定允许的来源 + allow_credentials=True, + allow_methods=["*"], # 允许所有方法 + allow_headers=["*"], # 允许所有头部 +) + + 
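For reference, the route handlers that follow (/generate, /send and /status/{uid}) consume the same JSON fields that ModelWorker.generate reads above (image, text, mesh, seed, octree_resolution, num_inference_steps, guidance_scale, texture, face_count, type). Below is a minimal client sketch, not part of this commit, assuming the server runs with the defaults from the __main__ block (port 8081) and using the assets/demo.png image added elsewhere in this commit as input.

```python
# Minimal client sketch (illustrative only, not part of this commit). Assumes the
# API server below is running with its default arguments (http://localhost:8081)
# and that assets/demo.png from this commit is available locally.
import base64
import time

import requests

BASE_URL = "http://localhost:8081"

with open("assets/demo.png", "rb") as f:
    payload = {
        "image": base64.b64encode(f.read()).decode(),  # decoded server-side by load_image_from_base64()
        "seed": 1234,
        "octree_resolution": 256,
        "num_inference_steps": 5,
        "guidance_scale": 5.0,
        "texture": False,  # True only works when the server is started with --enable_tex
    }

# Asynchronous flow: /send starts the job in a background thread and returns a uid;
# /status/{uid} is polled until the worker has written <uid>.glb into gradio_cache.
uid = requests.post(f"{BASE_URL}/send", json=payload).json()["uid"]
while True:
    status = requests.get(f"{BASE_URL}/status/{uid}").json()
    if status["status"] == "completed":
        with open("output.glb", "wb") as out:
            out.write(base64.b64decode(status["model_base64"]))
        break
    time.sleep(5)

# Synchronous alternative: POST the same payload to /generate and write the raw
# response body, which is returned as a FileResponse containing the exported mesh.
# resp = requests.post(f"{BASE_URL}/generate", json=payload)
# with open("output_sync.glb", "wb") as out:
#     out.write(resp.content)
```

Note that /status only ever reports "processing" or "completed", so a job that fails in the worker thread never surfaces an error to this client; a real caller would want to bound the polling loop.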
+@app.post("/generate") +async def generate(request: Request): + logger.info("Worker generating...") + params = await request.json() + uid = uuid.uuid4() + try: + file_path, uid = worker.generate(uid, params) + return FileResponse(file_path) + except ValueError as e: + traceback.print_exc() + print("Caught ValueError:", e) + ret = { + "text": server_error_msg, + "error_code": 1, + } + return JSONResponse(ret, status_code=404) + except torch.cuda.CudaError as e: + print("Caught torch.cuda.CudaError:", e) + ret = { + "text": server_error_msg, + "error_code": 1, + } + return JSONResponse(ret, status_code=404) + except Exception as e: + print("Caught Unknown Error", e) + traceback.print_exc() + ret = { + "text": server_error_msg, + "error_code": 1, + } + return JSONResponse(ret, status_code=404) + + +@app.post("/send") +async def generate(request: Request): + logger.info("Worker send...") + params = await request.json() + uid = uuid.uuid4() + threading.Thread(target=worker.generate, args=(uid, params,)).start() + ret = {"uid": str(uid)} + return JSONResponse(ret, status_code=200) + + +@app.get("/status/{uid}") +async def status(uid: str): + save_file_path = os.path.join(SAVE_DIR, f'{uid}.glb') + print(save_file_path, os.path.exists(save_file_path)) + if not os.path.exists(save_file_path): + response = {'status': 'processing'} + return JSONResponse(response, status_code=200) + else: + base64_str = base64.b64encode(open(save_file_path, 'rb').read()).decode() + response = {'status': 'completed', 'model_base64': base64_str} + return JSONResponse(response, status_code=200) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="0.0.0.0") + parser.add_argument("--port", type=int, default=8081) + parser.add_argument("--model_path", type=str, default='tencent/Hunyuan3D-2mini') + parser.add_argument("--tex_model_path", type=str, default='tencent/Hunyuan3D-2') + parser.add_argument("--device", type=str, default="cuda") + parser.add_argument("--limit-model-concurrency", type=int, default=5) + parser.add_argument('--enable_tex', action='store_true') + args = parser.parse_args() + logger.info(f"args: {args}") + + model_semaphore = asyncio.Semaphore(args.limit_model_concurrency) + + worker = ModelWorker(model_path=args.model_path, device=args.device, enable_tex=args.enable_tex, + tex_model_path=args.tex_model_path) + uvicorn.run(app, host=args.host, port=args.port, log_level="info") diff --git a/assets/1.glb b/assets/1.glb new file mode 100644 index 0000000..0d33b00 Binary files /dev/null and b/assets/1.glb differ diff --git a/assets/demo.png b/assets/demo.png new file mode 100644 index 0000000..00fda1d Binary files /dev/null and b/assets/demo.png differ diff --git a/assets/env_maps/gradient.jpg b/assets/env_maps/gradient.jpg new file mode 100644 index 0000000..55546c1 Binary files /dev/null and b/assets/env_maps/gradient.jpg differ diff --git a/assets/env_maps/white.jpg b/assets/env_maps/white.jpg new file mode 100644 index 0000000..f7af123 Binary files /dev/null and b/assets/env_maps/white.jpg differ diff --git a/assets/example_images/004.png b/assets/example_images/004.png new file mode 100644 index 0000000..95eb0da Binary files /dev/null and b/assets/example_images/004.png differ diff --git a/assets/example_images/052.png b/assets/example_images/052.png new file mode 100644 index 0000000..685ef05 Binary files /dev/null and b/assets/example_images/052.png differ diff --git a/assets/example_images/073.png b/assets/example_images/073.png new 
file mode 100644 index 0000000..0390125 Binary files /dev/null and b/assets/example_images/073.png differ diff --git a/assets/example_images/075.png b/assets/example_images/075.png new file mode 100644 index 0000000..1381f75 Binary files /dev/null and b/assets/example_images/075.png differ diff --git a/assets/example_images/1008.png b/assets/example_images/1008.png new file mode 100644 index 0000000..473c933 Binary files /dev/null and b/assets/example_images/1008.png differ diff --git a/assets/example_images/101.png b/assets/example_images/101.png new file mode 100644 index 0000000..b0c5875 Binary files /dev/null and b/assets/example_images/101.png differ diff --git a/assets/example_images/1022.png b/assets/example_images/1022.png new file mode 100644 index 0000000..033fc3d Binary files /dev/null and b/assets/example_images/1022.png differ diff --git a/assets/example_images/1029.png b/assets/example_images/1029.png new file mode 100644 index 0000000..d3b13cd Binary files /dev/null and b/assets/example_images/1029.png differ diff --git a/assets/example_images/1037.png b/assets/example_images/1037.png new file mode 100644 index 0000000..e2ac72e Binary files /dev/null and b/assets/example_images/1037.png differ diff --git a/assets/example_images/1079.png b/assets/example_images/1079.png new file mode 100644 index 0000000..0398f6b Binary files /dev/null and b/assets/example_images/1079.png differ diff --git a/assets/example_images/1111.png b/assets/example_images/1111.png new file mode 100644 index 0000000..ea24af1 Binary files /dev/null and b/assets/example_images/1111.png differ diff --git a/assets/example_images/1123.png b/assets/example_images/1123.png new file mode 100644 index 0000000..71e862b Binary files /dev/null and b/assets/example_images/1123.png differ diff --git a/assets/example_images/1128.png b/assets/example_images/1128.png new file mode 100644 index 0000000..f04d30d Binary files /dev/null and b/assets/example_images/1128.png differ diff --git a/assets/example_images/1135.png b/assets/example_images/1135.png new file mode 100644 index 0000000..e4242dc Binary files /dev/null and b/assets/example_images/1135.png differ diff --git a/assets/example_images/1146.png b/assets/example_images/1146.png new file mode 100644 index 0000000..d9541b6 Binary files /dev/null and b/assets/example_images/1146.png differ diff --git a/assets/example_images/1148.png b/assets/example_images/1148.png new file mode 100644 index 0000000..2d76c9e Binary files /dev/null and b/assets/example_images/1148.png differ diff --git a/assets/example_images/1154.png b/assets/example_images/1154.png new file mode 100644 index 0000000..2ab169c Binary files /dev/null and b/assets/example_images/1154.png differ diff --git a/assets/example_images/1180.png b/assets/example_images/1180.png new file mode 100644 index 0000000..1bf552b Binary files /dev/null and b/assets/example_images/1180.png differ diff --git a/assets/example_images/1196.png b/assets/example_images/1196.png new file mode 100644 index 0000000..c7c4e9d Binary files /dev/null and b/assets/example_images/1196.png differ diff --git a/assets/example_images/1204.png b/assets/example_images/1204.png new file mode 100644 index 0000000..569dd9e Binary files /dev/null and b/assets/example_images/1204.png differ diff --git a/assets/example_images/1234.png b/assets/example_images/1234.png new file mode 100644 index 0000000..105a9a3 Binary files /dev/null and b/assets/example_images/1234.png differ diff --git a/assets/example_images/1310.png 
b/assets/example_images/1310.png new file mode 100644 index 0000000..42f5fdd Binary files /dev/null and b/assets/example_images/1310.png differ diff --git a/assets/example_images/1316.png b/assets/example_images/1316.png new file mode 100644 index 0000000..a3f2902 Binary files /dev/null and b/assets/example_images/1316.png differ diff --git a/assets/example_images/1354.png b/assets/example_images/1354.png new file mode 100644 index 0000000..685c11c Binary files /dev/null and b/assets/example_images/1354.png differ diff --git a/assets/example_images/1429.png b/assets/example_images/1429.png new file mode 100644 index 0000000..976d1a4 Binary files /dev/null and b/assets/example_images/1429.png differ diff --git a/assets/example_images/1493.png b/assets/example_images/1493.png new file mode 100644 index 0000000..dd1e979 Binary files /dev/null and b/assets/example_images/1493.png differ diff --git a/assets/example_images/1582.png b/assets/example_images/1582.png new file mode 100644 index 0000000..e67ed66 Binary files /dev/null and b/assets/example_images/1582.png differ diff --git a/assets/example_images/1583.png b/assets/example_images/1583.png new file mode 100644 index 0000000..01f6a48 Binary files /dev/null and b/assets/example_images/1583.png differ diff --git a/assets/example_images/1596.png b/assets/example_images/1596.png new file mode 100644 index 0000000..55d3970 Binary files /dev/null and b/assets/example_images/1596.png differ diff --git a/assets/example_images/1601.png b/assets/example_images/1601.png new file mode 100644 index 0000000..e3bdbbd Binary files /dev/null and b/assets/example_images/1601.png differ diff --git a/assets/example_images/1603.png b/assets/example_images/1603.png new file mode 100644 index 0000000..8f2eb53 Binary files /dev/null and b/assets/example_images/1603.png differ diff --git a/assets/example_images/1626.png b/assets/example_images/1626.png new file mode 100644 index 0000000..faa2f73 Binary files /dev/null and b/assets/example_images/1626.png differ diff --git a/assets/example_images/1627.png b/assets/example_images/1627.png new file mode 100644 index 0000000..000c9ab Binary files /dev/null and b/assets/example_images/1627.png differ diff --git a/assets/example_images/1654.png b/assets/example_images/1654.png new file mode 100644 index 0000000..2385031 Binary files /dev/null and b/assets/example_images/1654.png differ diff --git a/assets/example_images/167.png b/assets/example_images/167.png new file mode 100644 index 0000000..ab59d39 Binary files /dev/null and b/assets/example_images/167.png differ diff --git a/assets/example_images/1670.png b/assets/example_images/1670.png new file mode 100644 index 0000000..c6d7157 Binary files /dev/null and b/assets/example_images/1670.png differ diff --git a/assets/example_images/1679.png b/assets/example_images/1679.png new file mode 100644 index 0000000..ce14585 Binary files /dev/null and b/assets/example_images/1679.png differ diff --git a/assets/example_images/1687.png b/assets/example_images/1687.png new file mode 100644 index 0000000..90d406c Binary files /dev/null and b/assets/example_images/1687.png differ diff --git a/assets/example_images/1698.png b/assets/example_images/1698.png new file mode 100644 index 0000000..91e7032 Binary files /dev/null and b/assets/example_images/1698.png differ diff --git a/assets/example_images/1715.png b/assets/example_images/1715.png new file mode 100644 index 0000000..2ee44da Binary files /dev/null and b/assets/example_images/1715.png differ diff --git 
a/assets/example_images/1735.png b/assets/example_images/1735.png new file mode 100644 index 0000000..a7a722c Binary files /dev/null and b/assets/example_images/1735.png differ diff --git a/assets/example_images/1738.png b/assets/example_images/1738.png new file mode 100644 index 0000000..50d4020 Binary files /dev/null and b/assets/example_images/1738.png differ diff --git a/assets/example_images/1744.png b/assets/example_images/1744.png new file mode 100644 index 0000000..767f820 Binary files /dev/null and b/assets/example_images/1744.png differ diff --git a/assets/example_images/1758.png b/assets/example_images/1758.png new file mode 100644 index 0000000..aafb219 Binary files /dev/null and b/assets/example_images/1758.png differ diff --git a/assets/example_images/1772.png b/assets/example_images/1772.png new file mode 100644 index 0000000..17bcfff Binary files /dev/null and b/assets/example_images/1772.png differ diff --git a/assets/example_images/1773.png b/assets/example_images/1773.png new file mode 100644 index 0000000..1ffc05d Binary files /dev/null and b/assets/example_images/1773.png differ diff --git a/assets/example_images/1778.png b/assets/example_images/1778.png new file mode 100644 index 0000000..4819163 Binary files /dev/null and b/assets/example_images/1778.png differ diff --git a/assets/example_images/179.png b/assets/example_images/179.png new file mode 100644 index 0000000..a2d4160 Binary files /dev/null and b/assets/example_images/179.png differ diff --git a/assets/example_images/1898.png b/assets/example_images/1898.png new file mode 100644 index 0000000..af43116 Binary files /dev/null and b/assets/example_images/1898.png differ diff --git a/assets/example_images/191.png b/assets/example_images/191.png new file mode 100644 index 0000000..0d342eb Binary files /dev/null and b/assets/example_images/191.png differ diff --git a/assets/example_images/195.png b/assets/example_images/195.png new file mode 100644 index 0000000..530444f Binary files /dev/null and b/assets/example_images/195.png differ diff --git a/assets/example_images/197.png b/assets/example_images/197.png new file mode 100644 index 0000000..b23b422 Binary files /dev/null and b/assets/example_images/197.png differ diff --git a/assets/example_images/198.png b/assets/example_images/198.png new file mode 100644 index 0000000..3215f00 Binary files /dev/null and b/assets/example_images/198.png differ diff --git a/assets/example_images/202.png b/assets/example_images/202.png new file mode 100644 index 0000000..a1389c8 Binary files /dev/null and b/assets/example_images/202.png differ diff --git a/assets/example_images/203.png b/assets/example_images/203.png new file mode 100644 index 0000000..a45b38e Binary files /dev/null and b/assets/example_images/203.png differ diff --git a/assets/example_images/218.png b/assets/example_images/218.png new file mode 100644 index 0000000..f8f9b29 Binary files /dev/null and b/assets/example_images/218.png differ diff --git a/assets/example_images/219.png b/assets/example_images/219.png new file mode 100644 index 0000000..61369cf Binary files /dev/null and b/assets/example_images/219.png differ diff --git a/assets/example_images/379.png b/assets/example_images/379.png new file mode 100644 index 0000000..0728d83 Binary files /dev/null and b/assets/example_images/379.png differ diff --git a/assets/example_images/380.png b/assets/example_images/380.png new file mode 100644 index 0000000..084a4c7 Binary files /dev/null and b/assets/example_images/380.png differ diff --git 
a/assets/example_images/419.png b/assets/example_images/419.png new file mode 100644 index 0000000..8e7cec9 Binary files /dev/null and b/assets/example_images/419.png differ diff --git a/assets/example_images/583.png b/assets/example_images/583.png new file mode 100644 index 0000000..c303211 Binary files /dev/null and b/assets/example_images/583.png differ diff --git a/assets/example_images/888.png b/assets/example_images/888.png new file mode 100644 index 0000000..185a4ce Binary files /dev/null and b/assets/example_images/888.png differ diff --git a/assets/example_images/895.png b/assets/example_images/895.png new file mode 100644 index 0000000..cf29d13 Binary files /dev/null and b/assets/example_images/895.png differ diff --git a/assets/example_images/example_000.png b/assets/example_images/example_000.png new file mode 100644 index 0000000..6222237 Binary files /dev/null and b/assets/example_images/example_000.png differ diff --git a/assets/example_images/example_002.png b/assets/example_images/example_002.png new file mode 100644 index 0000000..a6fd2a5 Binary files /dev/null and b/assets/example_images/example_002.png differ diff --git a/assets/example_mv_images/1/back.png b/assets/example_mv_images/1/back.png new file mode 100644 index 0000000..b4e0509 Binary files /dev/null and b/assets/example_mv_images/1/back.png differ diff --git a/assets/example_mv_images/1/front.png b/assets/example_mv_images/1/front.png new file mode 100644 index 0000000..1417f8c Binary files /dev/null and b/assets/example_mv_images/1/front.png differ diff --git a/assets/example_mv_images/1/left.png b/assets/example_mv_images/1/left.png new file mode 100644 index 0000000..ba76d31 Binary files /dev/null and b/assets/example_mv_images/1/left.png differ diff --git a/assets/example_mv_images/10/back.png b/assets/example_mv_images/10/back.png new file mode 100644 index 0000000..eef6ab0 Binary files /dev/null and b/assets/example_mv_images/10/back.png differ diff --git a/assets/example_mv_images/10/front.png b/assets/example_mv_images/10/front.png new file mode 100644 index 0000000..dda89b7 Binary files /dev/null and b/assets/example_mv_images/10/front.png differ diff --git a/assets/example_mv_images/10/left.png b/assets/example_mv_images/10/left.png new file mode 100644 index 0000000..e0579df Binary files /dev/null and b/assets/example_mv_images/10/left.png differ diff --git a/assets/example_mv_images/11/back.png b/assets/example_mv_images/11/back.png new file mode 100644 index 0000000..b586caf Binary files /dev/null and b/assets/example_mv_images/11/back.png differ diff --git a/assets/example_mv_images/11/front.png b/assets/example_mv_images/11/front.png new file mode 100644 index 0000000..595f9d6 Binary files /dev/null and b/assets/example_mv_images/11/front.png differ diff --git a/assets/example_mv_images/11/left.png b/assets/example_mv_images/11/left.png new file mode 100644 index 0000000..e83eccf Binary files /dev/null and b/assets/example_mv_images/11/left.png differ diff --git a/assets/example_mv_images/12/back.png b/assets/example_mv_images/12/back.png new file mode 100644 index 0000000..c49e0fc Binary files /dev/null and b/assets/example_mv_images/12/back.png differ diff --git a/assets/example_mv_images/12/front.png b/assets/example_mv_images/12/front.png new file mode 100644 index 0000000..148cd51 Binary files /dev/null and b/assets/example_mv_images/12/front.png differ diff --git a/assets/example_mv_images/12/left.png b/assets/example_mv_images/12/left.png new file mode 100644 index 0000000..4fbdb35 
Binary files /dev/null and b/assets/example_mv_images/12/left.png differ diff --git a/assets/example_mv_images/13/back.png b/assets/example_mv_images/13/back.png new file mode 100644 index 0000000..26685f5 Binary files /dev/null and b/assets/example_mv_images/13/back.png differ diff --git a/assets/example_mv_images/13/front.png b/assets/example_mv_images/13/front.png new file mode 100644 index 0000000..95053ac Binary files /dev/null and b/assets/example_mv_images/13/front.png differ diff --git a/assets/example_mv_images/13/left.png b/assets/example_mv_images/13/left.png new file mode 100644 index 0000000..34fe663 Binary files /dev/null and b/assets/example_mv_images/13/left.png differ diff --git a/assets/example_mv_images/14/back.png b/assets/example_mv_images/14/back.png new file mode 100644 index 0000000..1a48313 Binary files /dev/null and b/assets/example_mv_images/14/back.png differ diff --git a/assets/example_mv_images/14/front.png b/assets/example_mv_images/14/front.png new file mode 100644 index 0000000..3b58dfa Binary files /dev/null and b/assets/example_mv_images/14/front.png differ diff --git a/assets/example_mv_images/14/left.png b/assets/example_mv_images/14/left.png new file mode 100644 index 0000000..9842b3a Binary files /dev/null and b/assets/example_mv_images/14/left.png differ diff --git a/assets/example_mv_images/2/back.png b/assets/example_mv_images/2/back.png new file mode 100644 index 0000000..88a0513 Binary files /dev/null and b/assets/example_mv_images/2/back.png differ diff --git a/assets/example_mv_images/2/front.png b/assets/example_mv_images/2/front.png new file mode 100644 index 0000000..35c55ba Binary files /dev/null and b/assets/example_mv_images/2/front.png differ diff --git a/assets/example_mv_images/2/left.png b/assets/example_mv_images/2/left.png new file mode 100644 index 0000000..bd47d66 Binary files /dev/null and b/assets/example_mv_images/2/left.png differ diff --git a/assets/example_mv_images/3/back.png b/assets/example_mv_images/3/back.png new file mode 100644 index 0000000..98185fe Binary files /dev/null and b/assets/example_mv_images/3/back.png differ diff --git a/assets/example_mv_images/3/front.png b/assets/example_mv_images/3/front.png new file mode 100644 index 0000000..1265af6 Binary files /dev/null and b/assets/example_mv_images/3/front.png differ diff --git a/assets/example_mv_images/3/left.png b/assets/example_mv_images/3/left.png new file mode 100644 index 0000000..df83c19 Binary files /dev/null and b/assets/example_mv_images/3/left.png differ diff --git a/assets/example_mv_images/4/back.png b/assets/example_mv_images/4/back.png new file mode 100644 index 0000000..c818617 Binary files /dev/null and b/assets/example_mv_images/4/back.png differ diff --git a/assets/example_mv_images/4/front.png b/assets/example_mv_images/4/front.png new file mode 100644 index 0000000..8758fd6 Binary files /dev/null and b/assets/example_mv_images/4/front.png differ diff --git a/assets/example_mv_images/4/left.png b/assets/example_mv_images/4/left.png new file mode 100644 index 0000000..584be7f Binary files /dev/null and b/assets/example_mv_images/4/left.png differ diff --git a/assets/example_mv_images/5/back.png b/assets/example_mv_images/5/back.png new file mode 100644 index 0000000..71e53e1 Binary files /dev/null and b/assets/example_mv_images/5/back.png differ diff --git a/assets/example_mv_images/5/front.png b/assets/example_mv_images/5/front.png new file mode 100644 index 0000000..041f4ac Binary files /dev/null and b/assets/example_mv_images/5/front.png 
differ diff --git a/assets/example_mv_images/5/left.png b/assets/example_mv_images/5/left.png new file mode 100644 index 0000000..2337b26 Binary files /dev/null and b/assets/example_mv_images/5/left.png differ diff --git a/assets/example_mv_images/6/back.png b/assets/example_mv_images/6/back.png new file mode 100644 index 0000000..6ceb5d8 Binary files /dev/null and b/assets/example_mv_images/6/back.png differ diff --git a/assets/example_mv_images/6/front.png b/assets/example_mv_images/6/front.png new file mode 100644 index 0000000..95fc2c0 Binary files /dev/null and b/assets/example_mv_images/6/front.png differ diff --git a/assets/example_mv_images/6/left.png b/assets/example_mv_images/6/left.png new file mode 100644 index 0000000..944a731 Binary files /dev/null and b/assets/example_mv_images/6/left.png differ diff --git a/assets/example_mv_images/7/back.png b/assets/example_mv_images/7/back.png new file mode 100644 index 0000000..5ef772d Binary files /dev/null and b/assets/example_mv_images/7/back.png differ diff --git a/assets/example_mv_images/7/front.png b/assets/example_mv_images/7/front.png new file mode 100644 index 0000000..01b20d8 Binary files /dev/null and b/assets/example_mv_images/7/front.png differ diff --git a/assets/example_mv_images/7/left.png b/assets/example_mv_images/7/left.png new file mode 100644 index 0000000..bfa778a Binary files /dev/null and b/assets/example_mv_images/7/left.png differ diff --git a/assets/example_mv_images/8/back.png b/assets/example_mv_images/8/back.png new file mode 100644 index 0000000..d1d6b9d Binary files /dev/null and b/assets/example_mv_images/8/back.png differ diff --git a/assets/example_mv_images/8/front.png b/assets/example_mv_images/8/front.png new file mode 100644 index 0000000..9e3c6d8 Binary files /dev/null and b/assets/example_mv_images/8/front.png differ diff --git a/assets/example_mv_images/8/left.png b/assets/example_mv_images/8/left.png new file mode 100644 index 0000000..2aeb68a Binary files /dev/null and b/assets/example_mv_images/8/left.png differ diff --git a/assets/example_mv_images/9/back.png b/assets/example_mv_images/9/back.png new file mode 100644 index 0000000..e35be4b Binary files /dev/null and b/assets/example_mv_images/9/back.png differ diff --git a/assets/example_mv_images/9/front.png b/assets/example_mv_images/9/front.png new file mode 100644 index 0000000..c73d819 Binary files /dev/null and b/assets/example_mv_images/9/front.png differ diff --git a/assets/example_mv_images/9/left.png b/assets/example_mv_images/9/left.png new file mode 100644 index 0000000..4a2736c Binary files /dev/null and b/assets/example_mv_images/9/left.png differ diff --git a/assets/example_prompts.txt b/assets/example_prompts.txt new file mode 100644 index 0000000..5155022 --- /dev/null +++ b/assets/example_prompts.txt @@ -0,0 +1,5 @@ +一片绿色的树叶在白色背景上居中展现,清晰的纹理 +一只棕白相间的仓鼠,站在白色背景前。照片采用居中构图方式,卡通风格 +一盆绿色植物生长在红色花盆中,居中,写实 +a pot of green plants grows in a red flower pot. 
+a lovely rabbit eating carrots diff --git a/assets/images/arch.jpg b/assets/images/arch.jpg new file mode 100644 index 0000000..c2e608a Binary files /dev/null and b/assets/images/arch.jpg differ diff --git a/assets/images/e2e-1.gif b/assets/images/e2e-1.gif new file mode 100644 index 0000000..a79e112 Binary files /dev/null and b/assets/images/e2e-1.gif differ diff --git a/assets/images/e2e-2.gif b/assets/images/e2e-2.gif new file mode 100644 index 0000000..1653f8a Binary files /dev/null and b/assets/images/e2e-2.gif differ diff --git a/assets/images/system.jpg b/assets/images/system.jpg new file mode 100644 index 0000000..8ab500b Binary files /dev/null and b/assets/images/system.jpg differ diff --git a/assets/images/teaser.jpg b/assets/images/teaser.jpg new file mode 100644 index 0000000..6992a0d Binary files /dev/null and b/assets/images/teaser.jpg differ diff --git a/assets/images/teaser_wo_logo.jpg b/assets/images/teaser_wo_logo.jpg new file mode 100644 index 0000000..ee724da Binary files /dev/null and b/assets/images/teaser_wo_logo.jpg differ diff --git a/assets/modelviewer-template.html b/assets/modelviewer-template.html new file mode 100644 index 0000000..3406cb1 --- /dev/null +++ b/assets/modelviewer-template.html @@ -0,0 +1,81 @@ + + + + + + + + + + + + + +
[The 81 lines of HTML/JS added as assets/modelviewer-template.html were not preserved in this extract; only the diff "+" markers survived.]
\ No newline at end of file diff --git a/assets/modelviewer-textured-template.html b/assets/modelviewer-textured-template.html new file mode 100644 index 0000000..5f84cae --- /dev/null +++ b/assets/modelviewer-textured-template.html @@ -0,0 +1,136 @@
[The 136 lines of HTML/JS added as assets/modelviewer-textured-template.html were not preserved in this extract; the only surviving text labels are "Appearance" and "Geometry".]
\ No newline at end of file diff --git a/assets/qrcode/discord.png b/assets/qrcode/discord.png new file mode 100644 index 0000000..a9c326d Binary files /dev/null and b/assets/qrcode/discord.png differ diff --git a/assets/qrcode/wechat.png b/assets/qrcode/wechat.png new file mode 100644 index 0000000..4f25092 Binary files /dev/null and b/assets/qrcode/wechat.png differ diff --git a/assets/qrcode/x.png b/assets/qrcode/x.png new file mode 100644 index 0000000..e9f9044 Binary files /dev/null and b/assets/qrcode/x.png differ diff --git a/assets/qrcode/xiaohongshu.png b/assets/qrcode/xiaohongshu.png new file mode 100644 index 0000000..7ace644 Binary files /dev/null and b/assets/qrcode/xiaohongshu.png differ diff --git a/blender_addon.py b/blender_addon.py new file mode 100644 index 0000000..149745c --- /dev/null +++ b/blender_addon.py @@ -0,0 +1,347 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the respective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
+ +bl_info = { + "name": "Hunyuan3D-2 Generator", + "author": "Tencent Hunyuan3D", + "version": (1, 0), + "blender": (3, 0, 0), + "location": "View3D > Sidebar > Hunyuan3D-2 3D Generator", + "description": "Generate/Texturing 3D models from text descriptions or images", + "category": "3D View", +} +import base64 +import os +import tempfile +import threading + +import bpy +import requests +from bpy.props import StringProperty, BoolProperty, IntProperty, FloatProperty + + +class Hunyuan3DProperties(bpy.types.PropertyGroup): + prompt: StringProperty( + name="Text Prompt", + description="Describe what you want to generate", + default="" + ) + api_url: StringProperty( + name="API URL", + description="URL of the Text-to-3D API service", + default="http://localhost:8080" + ) + is_processing: BoolProperty( + name="Processing", + default=False + ) + job_id: StringProperty( + name="Job ID", + default="" + ) + status_message: StringProperty( + name="Status Message", + default="" + ) + # 添加图片路径属性 + image_path: StringProperty( + name="Image", + description="Select an image to upload", + subtype='FILE_PATH' + ) + # 修改后的 octree_resolution 属性 + octree_resolution: IntProperty( + name="Octree Resolution", + description="Octree resolution for the 3D generation", + default=256, + min=128, + max=512, + ) + num_inference_steps: IntProperty( + name="Number of Inference Steps", + description="Number of inference steps for the 3D generation", + default=20, + min=20, + max=50 + ) + guidance_scale: FloatProperty( + name="Guidance Scale", + description="Guidance scale for the 3D generation", + default=5.5, + min=1.0, + max=10.0 + ) + # 添加 texture 属性 + texture: BoolProperty( + name="Generate Texture", + description="Whether to generate texture for the 3D model", + default=False + ) + + +class Hunyuan3DOperator(bpy.types.Operator): + bl_idname = "object.generate_3d" + bl_label = "Generate 3D Model" + bl_description = "Generate a 3D model from text description, an image or a selected mesh" + + job_id = '' + prompt = "" + api_url = "" + image_path = "" + octree_resolution = 256 + num_inference_steps = 20 + guidance_scale = 5.5 + texture = False # 新增属性 + selected_mesh_base64 = "" + selected_mesh = None # 新增属性,用于存储选中的 mesh + + thread = None + task_finished = False + + def modal(self, context, event): + if event.type in {'RIGHTMOUSE', 'ESC'}: + return {'CANCELLED'} + + if self.task_finished: + print("Threaded task completed") + self.task_finished = False + props = context.scene.gen_3d_props + props.is_processing = False + + return {'PASS_THROUGH'} + + def invoke(self, context, event): + # 启动线程 + props = context.scene.gen_3d_props + self.prompt = props.prompt + self.api_url = props.api_url + self.image_path = props.image_path + self.octree_resolution = props.octree_resolution + self.num_inference_steps = props.num_inference_steps + self.guidance_scale = props.guidance_scale + self.texture = props.texture # 获取 texture 属性的值 + + if self.prompt == "" and self.image_path == "": + self.report({'WARNING'}, "Please enter some text or select an image first.") + return {'FINISHED'} + + # 保存选中的 mesh 对象引用 + for obj in context.selected_objects: + if obj.type == 'MESH': + self.selected_mesh = obj + break + + if self.selected_mesh: + temp_glb_file = tempfile.NamedTemporaryFile(delete=False, suffix=".glb") + temp_glb_file.close() + bpy.ops.export_scene.gltf(filepath=temp_glb_file.name, use_selection=True) + with open(temp_glb_file.name, "rb") as file: + mesh_data = file.read() + mesh_b64_str = base64.b64encode(mesh_data).decode() + 
os.unlink(temp_glb_file.name) + self.selected_mesh_base64 = mesh_b64_str + + props.is_processing = True + + # 将相对路径转换为相对于 Blender 文件所在目录的绝对路径 + blend_file_dir = os.path.dirname(bpy.data.filepath) + self.report({'INFO'}, f"blend_file_dir {blend_file_dir}") + self.report({'INFO'}, f"image_path {self.image_path}") + if self.image_path.startswith('//'): + self.image_path = self.image_path[2:] + self.image_path = os.path.join(blend_file_dir, self.image_path) + + if self.selected_mesh and self.texture: + props.status_message = "Texturing Selected Mesh...\n" \ + "This may take several minutes depending \n on your GPU power." + else: + mesh_type = 'Textured Mesh' if self.texture else 'White Mesh' + prompt_type = 'Text Prompt' if self.prompt else 'Image' + props.status_message = f"Generating {mesh_type} with {prompt_type}...\n" \ + "This may take several minutes depending \n on your GPU power." + + self.thread = threading.Thread(target=self.generate_model) + self.thread.start() + + wm = context.window_manager + wm.modal_handler_add(self) + return {'RUNNING_MODAL'} + + def generate_model(self): + self.report({'INFO'}, f"Generation Start") + base_url = self.api_url.rstrip('/') + + try: + if self.selected_mesh_base64 and self.texture: + # Texturing the selected mesh + if self.image_path and os.path.exists(self.image_path): + self.report({'INFO'}, f"Post Texturing with Image") + # 打开图片文件并以二进制模式读取 + with open(self.image_path, "rb") as file: + # 读取文件内容 + image_data = file.read() + # 对图片数据进行 Base64 编码 + img_b64_str = base64.b64encode(image_data).decode() + response = requests.post( + f"{base_url}/generate", + json={ + "mesh": self.selected_mesh_base64, + "image": img_b64_str, + "octree_resolution": self.octree_resolution, + "num_inference_steps": self.num_inference_steps, + "guidance_scale": self.guidance_scale, + "texture": self.texture # 传递 texture 参数 + }, + ) + else: + self.report({'INFO'}, f"Post Texturing with Text") + response = requests.post( + f"{base_url}/generate", + json={ + "mesh": self.selected_mesh_base64, + "text": self.prompt, + "octree_resolution": self.octree_resolution, + "num_inference_steps": self.num_inference_steps, + "guidance_scale": self.guidance_scale, + "texture": self.texture # 传递 texture 参数 + }, + ) + else: + if self.image_path: + if not os.path.exists(self.image_path): + self.report({'ERROR'}, f"Image path does not exist {self.image_path}") + raise Exception(f'Image path does not exist {self.image_path}') + self.report({'INFO'}, f"Post Start Image to 3D") + # 打开图片文件并以二进制模式读取 + with open(self.image_path, "rb") as file: + # 读取文件内容 + image_data = file.read() + # 对图片数据进行 Base64 编码 + img_b64_str = base64.b64encode(image_data).decode() + response = requests.post( + f"{base_url}/generate", + json={ + "image": img_b64_str, + "octree_resolution": self.octree_resolution, + "num_inference_steps": self.num_inference_steps, + "guidance_scale": self.guidance_scale, + "texture": self.texture # 传递 texture 参数 + }, + ) + else: + self.report({'INFO'}, f"Post Start Text to 3D") + response = requests.post( + f"{base_url}/generate", + json={ + "text": self.prompt, + "octree_resolution": self.octree_resolution, + "num_inference_steps": self.num_inference_steps, + "guidance_scale": self.guidance_scale, + "texture": self.texture # 传递 texture 参数 + }, + ) + self.report({'INFO'}, f"Post Done") + + if response.status_code != 200: + self.report({'ERROR'}, f"Generation failed: {response.text}") + return + + # Decode base64 and save to temporary file + temp_file = tempfile.NamedTemporaryFile(delete=False, 
suffix=".glb") + temp_file.write(response.content) + temp_file.close() + + # Import the GLB file in the main thread + def import_handler(): + bpy.ops.import_scene.gltf(filepath=temp_file.name) + os.unlink(temp_file.name) + + # 获取新导入的 mesh + new_obj = bpy.context.selected_objects[0] if bpy.context.selected_objects else None + if new_obj and self.selected_mesh and self.texture: + # 应用选中 mesh 的位置、旋转和缩放 + new_obj.location = self.selected_mesh.location + new_obj.rotation_euler = self.selected_mesh.rotation_euler + new_obj.scale = self.selected_mesh.scale + + # 隐藏原来的 mesh + self.selected_mesh.hide_set(True) + self.selected_mesh.hide_render = True + + return None + + bpy.app.timers.register(import_handler) + + except Exception as e: + self.report({'ERROR'}, f"Error: {str(e)}") + + finally: + self.task_finished = True + self.selected_mesh_base64 = "" + + +class Hunyuan3DPanel(bpy.types.Panel): + bl_space_type = 'VIEW_3D' + bl_region_type = 'UI' + bl_category = 'Hunyuan3D-2' + bl_label = 'Hunyuan3D-2 3D Generator' + + def draw(self, context): + layout = self.layout + props = context.scene.gen_3d_props + + layout.prop(props, "api_url") + layout.prop(props, "prompt") + # 添加图片选择器 + layout.prop(props, "image_path") + # 添加新属性的 UI 元素 + layout.prop(props, "octree_resolution") + layout.prop(props, "num_inference_steps") + layout.prop(props, "guidance_scale") + # 添加 texture 属性的 UI 元素 + layout.prop(props, "texture") + + row = layout.row() + row.enabled = not props.is_processing + row.operator("object.generate_3d") + + if props.is_processing: + if props.status_message: + for line in props.status_message.split("\n"): + layout.label(text=line) + else: + layout.label("Processing...") + + +classes = ( + Hunyuan3DProperties, + Hunyuan3DOperator, + Hunyuan3DPanel, +) + + +def register(): + for cls in classes: + bpy.utils.register_class(cls) + bpy.types.Scene.gen_3d_props = bpy.props.PointerProperty(type=Hunyuan3DProperties) + + +def unregister(): + for cls in reversed(classes): + bpy.utils.unregister_class(cls) + del bpy.types.Scene.gen_3d_props + + +if __name__ == "__main__": + register() diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..061f32f --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. 
+ echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..bdcdd60 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,25 @@ +myst-parser +sphinx-rtd-theme +furo +sphinx-copybutton +sphinx-inline-tabs +nbsphinx +nbsphinx_link +linkify-it-py +linkify +ipython + +torch +imageio +scikit_image +matplotlib +munch +tfpnp +cvxpy +torchlights +tensorboardX +termcolor +proximal +opencv-python +huggingface_hub +torchvision \ No newline at end of file diff --git a/docs/source/_static/brand.png b/docs/source/_static/brand.png new file mode 100644 index 0000000..7e7528a Binary files /dev/null and b/docs/source/_static/brand.png differ diff --git a/docs/source/_static/css/custom.css b/docs/source/_static/css/custom.css new file mode 100644 index 0000000..05a0d46 --- /dev/null +++ b/docs/source/_static/css/custom.css @@ -0,0 +1,38 @@ + +/*.sidebar-logo {*/ +/* display: block;*/ +/* margin: 0;*/ +/* max-width: 50%;*/ +/*}*/ + +.nbsphinx-gallery { + display: grid; + grid-template-columns: repeat(auto-fill, minmax(200px, 1fr)); + gap: 5px; + margin-top: 1em; + margin-bottom: 1em; +} + +h1 { + font-size: 2em +} + +h2 { + font-size: 1.3em +} + +h3 { + font-size: 1.25em +} + +h4 { + font-size: 1.125em +} + +h5 { + font-size: 1.07em +} + +h6 { + font-size: 1em +} \ No newline at end of file diff --git a/docs/source/_static/favicon.ico b/docs/source/_static/favicon.ico new file mode 100644 index 0000000..927560b Binary files /dev/null and b/docs/source/_static/favicon.ico differ diff --git a/docs/source/_static/image/example_deconv.png b/docs/source/_static/image/example_deconv.png new file mode 100644 index 0000000..00d8085 Binary files /dev/null and b/docs/source/_static/image/example_deconv.png differ diff --git a/docs/source/_static/image/optic_results.png b/docs/source/_static/image/optic_results.png new file mode 100644 index 0000000..fb0bc49 Binary files /dev/null and b/docs/source/_static/image/optic_results.png differ diff --git a/docs/source/_static/image/psf.png b/docs/source/_static/image/psf.png new file mode 100644 index 0000000..2197af7 Binary files /dev/null and b/docs/source/_static/image/psf.png differ diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..5e22fb8 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,141 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+ +import os +import sys + +sys.path.insert(0, os.path.abspath(".")) +sys.path.insert(0, os.path.abspath("../../")) + +# -- Project information ----------------------------------------------------- + +project = 'Hunyuan3D-2' +copyright = '2025, Tencent Hunyuan3D' +author = 'Hunyuan3D Team' + +# The full version, including alpha/beta/rc tags +release = '0.0.1' + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'myst_parser', + 'nbsphinx', + 'nbsphinx_link', + # "myst_nb", + 'sphinx_copybutton', + # "sphinx_inline_tabs", + # https://sphinx-codeautolink.readthedocs.io/en/latest/examples.html + 'sphinx.ext.autodoc', + "sphinx.ext.intersphinx", + "sphinx.ext.extlinks", + 'sphinx.ext.autosummary', + 'sphinx.ext.doctest', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.mathjax', + 'sphinx.ext.viewcode', + 'sphinx.ext.napoleon', +] + +# -- Options for extlinks ---------------------------------------------------- +# + +extlinks = { + "pypi": ("https://pypi.org/project/%s/", "%s"), +} + +# -- Options for intersphinx ------------------------------------------------- +# + +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), + "sphinx": ("https://www.sphinx-doc.org/en/master", None), + 'torch': ('https://pytorch.org/docs/master/', None) +} + +napoleon_preprocess_types = True + +myst_enable_extensions = [ + "amsmath", + "colon_fence", + "deflist", + "dollarmath", + "fieldlist", + "html_admonition", + "html_image", + "linkify", + "replacements", + "smartquotes", + "strikethrough", + "substitution", + "tasklist", +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +# html_theme = 'alabaster' +# html_theme = 'sphinx_rtd_theme' +html_theme = "furo" +html_title = "Hunyuan3D-2" +language = "en" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ['_static'] + +html_theme_options = { + "light_css_variables": { + "font-stack": "Arial,Noto Sans,sans-serif", + "font-stack--monospace": "IBM Plex Mono,ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace", + }, + "announcement": 'Release 🤗Turbo Series and FlashVDM, Fast Shape Generation within 1 Second Right Now!', +} + +# +# -- Options for TODOs ------------------------------------------------------- +# +todo_include_todos = True + +# +# -- Options for Markdown files ---------------------------------------------- +# +myst_admonition_enable = True +myst_deflist_enable = True +myst_heading_anchors = 3 + +html_favicon = '_static/favicon.ico' + +pygments_style = "default" +pygments_dark_style = "github-dark" + +html_css_files = [ + 'css/custom.css', +] diff --git a/examples/fast_shape_gen_multiview.py b/examples/fast_shape_gen_multiview.py new file mode 100644 index 0000000..bd7970a --- /dev/null +++ b/examples/fast_shape_gen_multiview.py @@ -0,0 +1,38 @@ +import time + +import torch +from PIL import Image + +from hy3dgen.rembg import BackgroundRemover +from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline + +images = { + "front": "assets/example_mv_images/1/front.png", + "left": "assets/example_mv_images/1/left.png", + "back": "assets/example_mv_images/1/back.png" +} + +for key in images: + image = Image.open(images[key]).convert("RGBA") + if image.mode == 'RGB': + rembg = BackgroundRemover() + image = rembg(image) + images[key] = image + +pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained( + 'tencent/Hunyuan3D-2mv', + subfolder='hunyuan3d-dit-v2-mv-turbo', + variant='fp16' +) +pipeline.enable_flashvdm() +start_time = time.time() +mesh = pipeline( + image=images, + num_inference_steps=5, + octree_resolution=380, + num_chunks=20000, + generator=torch.manual_seed(12345), + output_type='trimesh' +)[0] +print("--- %s seconds ---" % (time.time() - start_time)) +mesh.export(f'demo_mv3.glb') diff --git a/examples/fast_shape_gen_with_flashvdm.py b/examples/fast_shape_gen_with_flashvdm.py new file mode 100644 index 0000000..87f1f26 --- /dev/null +++ b/examples/fast_shape_gen_with_flashvdm.py @@ -0,0 +1,46 @@ +# HY3DGEN_DEBUG=1 USE_SAGEATTN=1 python3 examples/fast_shape_gen_with_flashvdm.py +# HY3DGEN_DEBUG=1 USE_SAGEATTN=0 python3 examples/fast_shape_gen_with_flashvdm.py + +import os +import time + +import torch +from PIL import Image + +from hy3dgen.rembg import BackgroundRemover +from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline + +pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained( + 'tencent/Hunyuan3D-2', + subfolder='hunyuan3d-dit-v2-0-turbo', + use_safetensors=True, +) +pipeline.enable_flashvdm() +# pipeline.compile() + +image_path = 'assets/demo.png' +image = Image.open(image_path).convert("RGBA") +if image.mode == 'RGB': + rembg = BackgroundRemover() + image = rembg(image) + + +def run(): + return pipeline( + image=image, + num_inference_steps=5, + octree_resolution=380, + num_chunks=200000, + generator=torch.manual_seed(12345), + output_type='trimesh' + )[0] + + +save_dir = 'tmp/results/' +os.makedirs(save_dir, exist_ok=True) + +for it in range(2): + start_time = time.time() + mesh = run() + print("--- %s seconds ---" % (time.time() - start_time)) + mesh.export(f'{save_dir}/run_{it}.glb') diff --git a/examples/fast_texture_gen_multiview.py b/examples/fast_texture_gen_multiview.py new file mode 100644 index 0000000..f333f8c --- /dev/null +++ b/examples/fast_texture_gen_multiview.py @@ -0,0 +1,32 @@ 
+import time + +import torch +from PIL import Image +import trimesh + +from hy3dgen.rembg import BackgroundRemover +from hy3dgen.texgen import Hunyuan3DPaintPipeline + +images_path = [ + "assets/example_mv_images/1/front.png", + "assets/example_mv_images/1/left.png", + "assets/example_mv_images/1/back.png" +] + +images = [] +for image_path in images_path: + image = Image.open(image_path) + if image.mode == 'RGB': + rembg = BackgroundRemover() + image = rembg(image) + images.append(image) + +pipeline = Hunyuan3DPaintPipeline.from_pretrained( + 'tencent/Hunyuan3D-2', + subfolder='hunyuan3d-paint-v2-0-turbo' +) + +mesh = trimesh.load('assets/1.glb') + +mesh = pipeline(mesh, image=images) +mesh.export('demo_textured.glb') \ No newline at end of file diff --git a/examples/faster_shape_gen_with_flashvdm_mini_turbo.py b/examples/faster_shape_gen_with_flashvdm_mini_turbo.py new file mode 100644 index 0000000..6ca8bb0 --- /dev/null +++ b/examples/faster_shape_gen_with_flashvdm_mini_turbo.py @@ -0,0 +1,48 @@ +# HY3DGEN_DEBUG=1 USE_SAGEATTN=1 python3 examples/faster_shape_gen_with_flashvdm_mini_turbo.py +# HY3DGEN_DEBUG=1 USE_SAGEATTN=0 python3 examples/faster_shape_gen_with_flashvdm_mini_turbo.py + +import os +import time + +import torch +from PIL import Image + +from hy3dgen.rembg import BackgroundRemover +from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline + +device = 'cuda' +pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained( + 'tencent/Hunyuan3D-2mini', + subfolder='hunyuan3d-dit-v2-mini-turbo', + use_safetensors=False, + device=device +) +pipeline.enable_flashvdm(topk_mode='merge') +# pipeline.compile() + +image_path = 'assets/demo.png' +image = Image.open(image_path).convert("RGBA") +if image.mode == 'RGB': + rembg = BackgroundRemover() + image = rembg(image) + + +def run(): + return pipeline( + image=image, + num_inference_steps=5, + octree_resolution=380, + num_chunks=20000, + generator=torch.manual_seed(12345), + output_type='trimesh' + )[0] + + +save_dir = 'tmp/results/' +os.makedirs(save_dir, exist_ok=True) + +for it in range(2): + start_time = time.time() + mesh = run() + print("--- %s seconds ---" % (time.time() - start_time)) + mesh.export(f'{save_dir}/run_{it}.glb') diff --git a/examples/shape_gen.py b/examples/shape_gen.py new file mode 100644 index 0000000..6ff4d84 --- /dev/null +++ b/examples/shape_gen.py @@ -0,0 +1,30 @@ +import time + +import torch +from PIL import Image + +from hy3dgen.rembg import BackgroundRemover +from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline + +image_path = 'assets/demo.png' +image = Image.open(image_path).convert("RGBA") +if image.mode == 'RGB': + rembg = BackgroundRemover() + image = rembg(image) + +pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained( + 'tencent/Hunyuan3D-2', + subfolder='hunyuan3d-dit-v2-0', + variant='fp16' +) + +start_time = time.time() +mesh = pipeline(image=image, + num_inference_steps=50, + octree_resolution=380, + num_chunks=20000, + generator=torch.manual_seed(12345), + output_type='trimesh' + )[0] +print("--- %s seconds ---" % (time.time() - start_time)) +mesh.export(f'demo.glb') diff --git a/examples/shape_gen_mini.py b/examples/shape_gen_mini.py new file mode 100644 index 0000000..17c0989 --- /dev/null +++ b/examples/shape_gen_mini.py @@ -0,0 +1,31 @@ +import time + +import torch +from PIL import Image + +from hy3dgen.rembg import BackgroundRemover +from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline + +image_path = 'assets/demo.png' +image = 
Image.open(image_path).convert("RGBA") +if image.mode == 'RGB': + rembg = BackgroundRemover() + image = rembg(image) + +pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained( + 'tencent/Hunyuan3D-2mini', + subfolder='hunyuan3d-dit-v2-mini', + variant='fp16' +) + +start_time = time.time() +mesh = pipeline( + image=image, + num_inference_steps=50, + octree_resolution=380, + num_chunks=20000, + generator=torch.manual_seed(12345), + output_type='trimesh' +)[0] +print("--- %s seconds ---" % (time.time() - start_time)) +mesh.export(f'demo_mini.glb') diff --git a/examples/shape_gen_multiview.py b/examples/shape_gen_multiview.py new file mode 100644 index 0000000..ff6452d --- /dev/null +++ b/examples/shape_gen_multiview.py @@ -0,0 +1,38 @@ +import time + +import torch +from PIL import Image + +from hy3dgen.rembg import BackgroundRemover +from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline + +images = { + "front": "assets/example_mv_images/1/front.png", + "left": "assets/example_mv_images/1/left.png", + "back": "assets/example_mv_images/1/back.png" +} + +for key in images: + image = Image.open(images[key]).convert("RGBA") + if image.mode == 'RGB': + rembg = BackgroundRemover() + image = rembg(image) + images[key] = image + +pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained( + 'tencent/Hunyuan3D-2mv', + subfolder='hunyuan3d-dit-v2-mv', + variant='fp16' +) + +start_time = time.time() +mesh = pipeline( + image=images, + num_inference_steps=50, + octree_resolution=380, + num_chunks=20000, + generator=torch.manual_seed(12345), + output_type='trimesh' +)[0] +print("--- %s seconds ---" % (time.time() - start_time)) +mesh.export(f'demo_mv.glb') diff --git a/examples/textured_shape_gen.py b/examples/textured_shape_gen.py new file mode 100644 index 0000000..b156c49 --- /dev/null +++ b/examples/textured_shape_gen.py @@ -0,0 +1,19 @@ +from PIL import Image + +from hy3dgen.rembg import BackgroundRemover +from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline +from hy3dgen.texgen import Hunyuan3DPaintPipeline + +model_path = 'tencent/Hunyuan3D-2' +pipeline_shapegen = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(model_path) +pipeline_texgen = Hunyuan3DPaintPipeline.from_pretrained(model_path) + +image_path = 'assets/demo.png' +image = Image.open(image_path).convert("RGBA") +if image.mode == 'RGB': + rembg = BackgroundRemover() + image = rembg(image) + +mesh = pipeline_shapegen(image=image)[0] +mesh = pipeline_texgen(mesh, image=image) +mesh.export('demo.glb') diff --git a/examples/textured_shape_gen_mini.py b/examples/textured_shape_gen_mini.py new file mode 100644 index 0000000..b4901f3 --- /dev/null +++ b/examples/textured_shape_gen_mini.py @@ -0,0 +1,36 @@ +import time + +import torch +from PIL import Image + +from hy3dgen.rembg import BackgroundRemover +from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline +from hy3dgen.texgen import Hunyuan3DPaintPipeline + +image_path = 'assets/demo.png' +image = Image.open(image_path).convert("RGBA") +if image.mode == 'RGB': + rembg = BackgroundRemover() + image = rembg(image) + +pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained( + 'tencent/Hunyuan3D-2mini', + subfolder='hunyuan3d-dit-v2-mini', + variant='fp16' +) +pipeline_texgen = Hunyuan3DPaintPipeline.from_pretrained('tencent/Hunyuan3D-2') + +start_time = time.time() +mesh = pipeline( + image=image, + num_inference_steps=50, + octree_resolution=380, + num_chunks=20000, + generator=torch.manual_seed(12345), + output_type='trimesh' +)[0] +print("--- %s seconds ---" % 
(time.time() - start_time)) +mesh.export(f'demo_mini.glb') + +mesh = pipeline_texgen(mesh, image=image) +mesh.export('demo_textured_mini.glb') diff --git a/examples/textured_shape_gen_multiview.py b/examples/textured_shape_gen_multiview.py new file mode 100644 index 0000000..8a32c5a --- /dev/null +++ b/examples/textured_shape_gen_multiview.py @@ -0,0 +1,43 @@ +import time + +import torch +from PIL import Image + +from hy3dgen.rembg import BackgroundRemover +from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline +from hy3dgen.texgen import Hunyuan3DPaintPipeline + +images = { + "front": "assets/example_mv_images/1/front.png", + "left": "assets/example_mv_images/1/left.png", + "back": "assets/example_mv_images/1/back.png" +} + +for key in images: + image = Image.open(images[key]).convert("RGBA") + if image.mode == 'RGB': + rembg = BackgroundRemover() + image = rembg(image) + images[key] = image + +pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained( + 'tencent/Hunyuan3D-2mv', + subfolder='hunyuan3d-dit-v2-mv', + variant='fp16' +) +pipeline_texgen = Hunyuan3DPaintPipeline.from_pretrained('tencent/Hunyuan3D-2') + +start_time = time.time() +mesh = pipeline( + image=images, + num_inference_steps=50, + octree_resolution=380, + num_chunks=20000, + generator=torch.manual_seed(12345), + output_type='trimesh' +)[0] +print("--- %s seconds ---" % (time.time() - start_time)) +mesh.export(f'demo_white_mesh_mv.glb') + +mesh = pipeline_texgen(mesh, image=images["front"]) +mesh.export('demo_textured_mv.glb') diff --git a/gradio_app.py b/gradio_app.py new file mode 100644 index 0000000..8d81813 --- /dev/null +++ b/gradio_app.py @@ -0,0 +1,755 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
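Editorial sketch: the textured examples above hand the raw generated mesh straight to Hunyuan3DPaintPipeline, whereas gradio_app.py (whose diff begins here) cleans and simplifies the white mesh between the two stages. A minimal sketch of that ordering, using only classes this commit exports from hy3dgen.shapegen; the asset path and the 40000-face target are illustrative assumptions, not values taken from the commit.

from PIL import Image

from hy3dgen.shapegen import (
    Hunyuan3DDiTFlowMatchingPipeline,
    FloaterRemover,
    DegenerateFaceRemover,
    FaceReducer,
)
from hy3dgen.texgen import Hunyuan3DPaintPipeline

shape_pipe = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained('tencent/Hunyuan3D-2')
paint_pipe = Hunyuan3DPaintPipeline.from_pretrained('tencent/Hunyuan3D-2')

image = Image.open('assets/demo.png').convert('RGBA')  # illustrative input

mesh = shape_pipe(image=image, output_type='trimesh')[0]

# Clean up the white mesh before texturing, mirroring the order used in gradio_app.py
mesh = FloaterRemover()(mesh)           # drop small disconnected components
mesh = DegenerateFaceRemover()(mesh)    # remove degenerate faces
mesh = FaceReducer()(mesh, 40000)       # simplify to a target face count (assumed value)

mesh = paint_pipe(mesh, image=image)
mesh.export('demo_textured_cleaned.glb')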
+ +import os +import random +import shutil +import time +from glob import glob +from pathlib import Path + +import gradio as gr +import torch +import trimesh +import uvicorn +from fastapi import FastAPI +from fastapi.staticfiles import StaticFiles +import uuid + +from hy3dgen.shapegen.utils import logger + +MAX_SEED = int(1e7) + + +def get_example_img_list(): + print('Loading example img list ...') + return sorted(glob('./assets/example_images/**/*.png', recursive=True)) + + +def get_example_txt_list(): + print('Loading example txt list ...') + txt_list = list() + for line in open('./assets/example_prompts.txt', encoding='utf-8'): + txt_list.append(line.strip()) + return txt_list + + +def get_example_mv_list(): + print('Loading example mv list ...') + mv_list = list() + root = './assets/example_mv_images' + for mv_dir in os.listdir(root): + view_list = [] + for view in ['front', 'back', 'left', 'right']: + path = os.path.join(root, mv_dir, f'{view}.png') + if os.path.exists(path): + view_list.append(path) + else: + view_list.append(None) + mv_list.append(view_list) + return mv_list + + +def gen_save_folder(max_size=200): + os.makedirs(SAVE_DIR, exist_ok=True) + + # Collect all sub-folder paths + dirs = [f for f in Path(SAVE_DIR).iterdir() if f.is_dir()] + + # If the number of folders exceeds max_size, delete the oldest one + if len(dirs) >= max_size: + # Sort by creation time; the oldest comes first + oldest_dir = min(dirs, key=lambda x: x.stat().st_ctime) + shutil.rmtree(oldest_dir) + print(f"Removed the oldest folder: {oldest_dir}") + + # Generate a new uuid-named folder + new_folder = os.path.join(SAVE_DIR, str(uuid.uuid4())) + os.makedirs(new_folder, exist_ok=True) + print(f"Created new folder: {new_folder}") + + return new_folder + + +def export_mesh(mesh, save_folder, textured=False, type='glb'): + if textured: + path = os.path.join(save_folder, f'textured_mesh.{type}') + else: + path = os.path.join(save_folder, f'white_mesh.{type}') + if type not in ['glb', 'obj']: + mesh.export(path) + else: + mesh.export(path, include_normals=textured) + return path + + +def randomize_seed_fn(seed: int, randomize_seed: bool) -> int: + if randomize_seed: + seed = random.randint(0, MAX_SEED) + return seed + + +def build_model_viewer_html(save_folder, height=660, width=790, textured=False): + # Remove first folder from path to make relative path + if textured: + related_path = f"./textured_mesh.glb" + template_name = './assets/modelviewer-textured-template.html' + output_html_path = os.path.join(save_folder, f'textured_mesh.html') + else: + related_path = f"./white_mesh.glb" + template_name = './assets/modelviewer-template.html' + output_html_path = os.path.join(save_folder, f'white_mesh.html') + offset = 50 if textured else 10 + with open(os.path.join(CURRENT_DIR, template_name), 'r', encoding='utf-8') as f: + template_html = f.read() + + with open(output_html_path, 'w', encoding='utf-8') as f: + template_html = template_html.replace('#height#', f'{height - offset}') + template_html = template_html.replace('#width#', f'{width}') + template_html = template_html.replace('#src#', f'{related_path}/') + f.write(template_html) + + rel_path = os.path.relpath(output_html_path, SAVE_DIR) + iframe_tag = f'' + print( + f'Find html file {output_html_path}, {os.path.exists(output_html_path)}, relative HTML path is /static/{rel_path}') + + return f""" +
+ {iframe_tag} +
+ """ + + +def _gen_shape( + caption=None, + image=None, + mv_image_front=None, + mv_image_back=None, + mv_image_left=None, + mv_image_right=None, + steps=50, + guidance_scale=7.5, + seed=1234, + octree_resolution=256, + check_box_rembg=False, + num_chunks=200000, + randomize_seed: bool = False, +): + if not MV_MODE and image is None and caption is None: + raise gr.Error("Please provide either a caption or an image.") + if MV_MODE: + if mv_image_front is None and mv_image_back is None and mv_image_left is None and mv_image_right is None: + raise gr.Error("Please provide at least one view image.") + image = {} + if mv_image_front: + image['front'] = mv_image_front + if mv_image_back: + image['back'] = mv_image_back + if mv_image_left: + image['left'] = mv_image_left + if mv_image_right: + image['right'] = mv_image_right + + seed = int(randomize_seed_fn(seed, randomize_seed)) + + octree_resolution = int(octree_resolution) + if caption: print('prompt is', caption) + save_folder = gen_save_folder() + stats = { + 'model': { + 'shapegen': f'{args.model_path}/{args.subfolder}', + 'texgen': f'{args.texgen_model_path}', + }, + 'params': { + 'caption': caption, + 'steps': steps, + 'guidance_scale': guidance_scale, + 'seed': seed, + 'octree_resolution': octree_resolution, + 'check_box_rembg': check_box_rembg, + 'num_chunks': num_chunks, + } + } + time_meta = {} + + if image is None: + start_time = time.time() + try: + image = t2i_worker(caption) + except Exception as e: + raise gr.Error(f"Text to 3D is disable. Please enable it by `python gradio_app.py --enable_t23d`.") + time_meta['text2image'] = time.time() - start_time + + # remove disk io to make responding faster, uncomment at your will. + # image.save(os.path.join(save_folder, 'input.png')) + if MV_MODE: + start_time = time.time() + for k, v in image.items(): + if check_box_rembg or v.mode == "RGB": + img = rmbg_worker(v.convert('RGB')) + image[k] = img + time_meta['remove background'] = time.time() - start_time + else: + if check_box_rembg or image.mode == "RGB": + start_time = time.time() + image = rmbg_worker(image.convert('RGB')) + time_meta['remove background'] = time.time() - start_time + + # remove disk io to make responding faster, uncomment at your will. 
+ # image.save(os.path.join(save_folder, 'rembg.png')) + + # image to white model + start_time = time.time() + + generator = torch.Generator() + generator = generator.manual_seed(int(seed)) + outputs = i23d_worker( + image=image, + num_inference_steps=steps, + guidance_scale=guidance_scale, + generator=generator, + octree_resolution=octree_resolution, + num_chunks=num_chunks, + output_type='mesh' + ) + time_meta['shape generation'] = time.time() - start_time + logger.info("---Shape generation takes %s seconds ---" % (time.time() - start_time)) + + tmp_start = time.time() + mesh = export_to_trimesh(outputs)[0] + time_meta['export to trimesh'] = time.time() - tmp_start + + stats['number_of_faces'] = mesh.faces.shape[0] + stats['number_of_vertices'] = mesh.vertices.shape[0] + + stats['time'] = time_meta + main_image = image if not MV_MODE else image['front'] + return mesh, main_image, save_folder, stats, seed + + +def generation_all( + caption=None, + image=None, + mv_image_front=None, + mv_image_back=None, + mv_image_left=None, + mv_image_right=None, + steps=50, + guidance_scale=7.5, + seed=1234, + octree_resolution=256, + check_box_rembg=False, + num_chunks=200000, + randomize_seed: bool = False, +): + start_time_0 = time.time() + mesh, image, save_folder, stats, seed = _gen_shape( + caption, + image, + mv_image_front=mv_image_front, + mv_image_back=mv_image_back, + mv_image_left=mv_image_left, + mv_image_right=mv_image_right, + steps=steps, + guidance_scale=guidance_scale, + seed=seed, + octree_resolution=octree_resolution, + check_box_rembg=check_box_rembg, + num_chunks=num_chunks, + randomize_seed=randomize_seed, + ) + path = export_mesh(mesh, save_folder, textured=False) + + # tmp_time = time.time() + # mesh = floater_remove_worker(mesh) + # mesh = degenerate_face_remove_worker(mesh) + # logger.info("---Postprocessing takes %s seconds ---" % (time.time() - tmp_time)) + # stats['time']['postprocessing'] = time.time() - tmp_time + + tmp_time = time.time() + mesh = face_reduce_worker(mesh) + logger.info("---Face Reduction takes %s seconds ---" % (time.time() - tmp_time)) + stats['time']['face reduction'] = time.time() - tmp_time + + tmp_time = time.time() + textured_mesh = texgen_worker(mesh, image) + logger.info("---Texture Generation takes %s seconds ---" % (time.time() - tmp_time)) + stats['time']['texture generation'] = time.time() - tmp_time + stats['time']['total'] = time.time() - start_time_0 + + textured_mesh.metadata['extras'] = stats + path_textured = export_mesh(textured_mesh, save_folder, textured=True) + model_viewer_html_textured = build_model_viewer_html(save_folder, height=HTML_HEIGHT, width=HTML_WIDTH, + textured=True) + if args.low_vram_mode: + torch.cuda.empty_cache() + return ( + gr.update(value=path), + gr.update(value=path_textured), + model_viewer_html_textured, + stats, + seed, + ) + + +def shape_generation( + caption=None, + image=None, + mv_image_front=None, + mv_image_back=None, + mv_image_left=None, + mv_image_right=None, + steps=50, + guidance_scale=7.5, + seed=1234, + octree_resolution=256, + check_box_rembg=False, + num_chunks=200000, + randomize_seed: bool = False, +): + start_time_0 = time.time() + mesh, image, save_folder, stats, seed = _gen_shape( + caption, + image, + mv_image_front=mv_image_front, + mv_image_back=mv_image_back, + mv_image_left=mv_image_left, + mv_image_right=mv_image_right, + steps=steps, + guidance_scale=guidance_scale, + seed=seed, + octree_resolution=octree_resolution, + check_box_rembg=check_box_rembg, + num_chunks=num_chunks, + 
randomize_seed=randomize_seed, + ) + stats['time']['total'] = time.time() - start_time_0 + mesh.metadata['extras'] = stats + + path = export_mesh(mesh, save_folder, textured=False) + model_viewer_html = build_model_viewer_html(save_folder, height=HTML_HEIGHT, width=HTML_WIDTH) + if args.low_vram_mode: + torch.cuda.empty_cache() + return ( + gr.update(value=path), + model_viewer_html, + stats, + seed, + ) + + +def build_app(): + title = 'Hunyuan3D-2: High Resolution Textured 3D Assets Generation' + if MV_MODE: + title = 'Hunyuan3D-2mv: Image to 3D Generation with 1-4 Views' + if 'mini' in args.subfolder: + title = 'Hunyuan3D-2mini: Strong 0.6B Image to Shape Generator' + if TURBO_MODE: + title = title.replace(':', '-Turbo: Fast ') + + title_html = f""" +
+ + {title} +
+
+ Tencent Hunyuan3D Team +
+
+ Github   + Homepage   + Hunyuan3D Studio   + Technical Report   + Pretrained Models   +
+ """ + custom_css = """ + .app.svelte-wpkpf6.svelte-wpkpf6:not(.fill_width) { + max-width: 1480px; + } + .mv-image button .wrap { + font-size: 10px; + } + + .mv-image .icon-wrap { + width: 20px; + } + + """ + + with gr.Blocks(theme=gr.themes.Base(), title='Hunyuan-3D-2.0', analytics_enabled=False, css=custom_css) as demo: + gr.HTML(title_html) + + with gr.Row(): + with gr.Column(scale=3): + with gr.Tabs(selected='tab_img_prompt') as tabs_prompt: + with gr.Tab('Image Prompt', id='tab_img_prompt', visible=not MV_MODE) as tab_ip: + image = gr.Image(label='Image', type='pil', image_mode='RGBA', height=290) + + with gr.Tab('Text Prompt', id='tab_txt_prompt', visible=HAS_T2I and not MV_MODE) as tab_tp: + caption = gr.Textbox(label='Text Prompt', + placeholder='HunyuanDiT will be used to generate image.', + info='Example: A 3D model of a cute cat, white background') + with gr.Tab('MultiView Prompt', visible=MV_MODE) as tab_mv: + # gr.Label('Please upload at least one front image.') + with gr.Row(): + mv_image_front = gr.Image(label='Front', type='pil', image_mode='RGBA', height=140, + min_width=100, elem_classes='mv-image') + mv_image_back = gr.Image(label='Back', type='pil', image_mode='RGBA', height=140, + min_width=100, elem_classes='mv-image') + with gr.Row(): + mv_image_left = gr.Image(label='Left', type='pil', image_mode='RGBA', height=140, + min_width=100, elem_classes='mv-image') + mv_image_right = gr.Image(label='Right', type='pil', image_mode='RGBA', height=140, + min_width=100, elem_classes='mv-image') + + with gr.Row(): + btn = gr.Button(value='Gen Shape', variant='primary', min_width=100) + btn_all = gr.Button(value='Gen Textured Shape', + variant='primary', + visible=HAS_TEXTUREGEN, + min_width=100) + + with gr.Group(): + file_out = gr.File(label="File", visible=False) + file_out2 = gr.File(label="File", visible=False) + + with gr.Tabs(selected='tab_options' if TURBO_MODE else 'tab_export'): + with gr.Tab("Options", id='tab_options', visible=TURBO_MODE): + gen_mode = gr.Radio(label='Generation Mode', + info='Recommendation: Turbo for most cases, Fast for very complex cases, Standard seldom use.', + choices=['Turbo', 'Fast', 'Standard'], value='Turbo') + decode_mode = gr.Radio(label='Decoding Mode', + info='The resolution for exporting mesh from generated vectset', + choices=['Low', 'Standard', 'High'], + value='Standard') + with gr.Tab('Advanced Options', id='tab_advanced_options'): + with gr.Row(): + check_box_rembg = gr.Checkbox(value=True, label='Remove Background', min_width=100) + randomize_seed = gr.Checkbox(label="Randomize seed", value=True, min_width=100) + seed = gr.Slider( + label="Seed", + minimum=0, + maximum=MAX_SEED, + step=1, + value=1234, + min_width=100, + ) + with gr.Row(): + num_steps = gr.Slider(maximum=100, + minimum=1, + value=5 if 'turbo' in args.subfolder else 30, + step=1, label='Inference Steps') + octree_resolution = gr.Slider(maximum=512, minimum=16, value=256, label='Octree Resolution') + with gr.Row(): + cfg_scale = gr.Number(value=5.0, label='Guidance Scale', min_width=100) + num_chunks = gr.Slider(maximum=5000000, minimum=1000, value=8000, + label='Number of Chunks', min_width=100) + with gr.Tab("Export", id='tab_export'): + with gr.Row(): + file_type = gr.Dropdown(label='File Type', choices=SUPPORTED_FORMATS, + value='glb', min_width=100) + reduce_face = gr.Checkbox(label='Simplify Mesh', value=False, min_width=100) + export_texture = gr.Checkbox(label='Include Texture', value=False, + visible=False, min_width=100) + target_face_num = 
gr.Slider(maximum=1000000, minimum=100, value=10000, + label='Target Face Number') + with gr.Row(): + confirm_export = gr.Button(value="Transform", min_width=100) + file_export = gr.DownloadButton(label="Download", variant='primary', + interactive=False, min_width=100) + + with gr.Column(scale=6): + with gr.Tabs(selected='gen_mesh_panel') as tabs_output: + with gr.Tab('Generated Mesh', id='gen_mesh_panel'): + html_gen_mesh = gr.HTML(HTML_OUTPUT_PLACEHOLDER, label='Output') + with gr.Tab('Exporting Mesh', id='export_mesh_panel'): + html_export_mesh = gr.HTML(HTML_OUTPUT_PLACEHOLDER, label='Output') + with gr.Tab('Mesh Statistic', id='stats_panel'): + stats = gr.Json({}, label='Mesh Stats') + + with gr.Column(scale=3 if MV_MODE else 2): + with gr.Tabs(selected='tab_img_gallery') as gallery: + with gr.Tab('Image to 3D Gallery', id='tab_img_gallery', visible=not MV_MODE) as tab_gi: + with gr.Row(): + gr.Examples(examples=example_is, inputs=[image], + label=None, examples_per_page=18) + + with gr.Tab('Text to 3D Gallery', id='tab_txt_gallery', visible=HAS_T2I and not MV_MODE) as tab_gt: + with gr.Row(): + gr.Examples(examples=example_ts, inputs=[caption], + label=None, examples_per_page=18) + with gr.Tab('MultiView to 3D Gallery', id='tab_mv_gallery', visible=MV_MODE) as tab_mv: + with gr.Row(): + gr.Examples(examples=example_mvs, + inputs=[mv_image_front, mv_image_back, mv_image_left, mv_image_right], + label=None, examples_per_page=6) + + gr.HTML(f""" +
+ Activated Model - Shape Generation ({args.model_path}/{args.subfolder}); Texture Generation ({'Hunyuan3D-2' if HAS_TEXTUREGEN else 'Unavailable'}) +
+ """) + if not HAS_TEXTUREGEN: + gr.HTML(""" +
+ Warning: + Texture synthesis is disabled due to missing requirements, + please install the requirements following README.md to activate it. +
+ """) + if not args.enable_t23d: + gr.HTML(""" +
+ Warning: + Text to 3D is disabled. To activate it, please run `python gradio_app.py --enable_t23d`. +
+ """) + + tab_ip.select(fn=lambda: gr.update(selected='tab_img_gallery'), outputs=gallery) + if HAS_T2I: + tab_tp.select(fn=lambda: gr.update(selected='tab_txt_gallery'), outputs=gallery) + + btn.click( + shape_generation, + inputs=[ + caption, + image, + mv_image_front, + mv_image_back, + mv_image_left, + mv_image_right, + num_steps, + cfg_scale, + seed, + octree_resolution, + check_box_rembg, + num_chunks, + randomize_seed, + ], + outputs=[file_out, html_gen_mesh, stats, seed] + ).then( + lambda: (gr.update(visible=False, value=False), gr.update(interactive=True), gr.update(interactive=True), + gr.update(interactive=False)), + outputs=[export_texture, reduce_face, confirm_export, file_export], + ).then( + lambda: gr.update(selected='gen_mesh_panel'), + outputs=[tabs_output], + ) + + btn_all.click( + generation_all, + inputs=[ + caption, + image, + mv_image_front, + mv_image_back, + mv_image_left, + mv_image_right, + num_steps, + cfg_scale, + seed, + octree_resolution, + check_box_rembg, + num_chunks, + randomize_seed, + ], + outputs=[file_out, file_out2, html_gen_mesh, stats, seed] + ).then( + lambda: (gr.update(visible=True, value=True), gr.update(interactive=False), gr.update(interactive=True), + gr.update(interactive=False)), + outputs=[export_texture, reduce_face, confirm_export, file_export], + ).then( + lambda: gr.update(selected='gen_mesh_panel'), + outputs=[tabs_output], + ) + + def on_gen_mode_change(value): + if value == 'Turbo': + return gr.update(value=5) + elif value == 'Fast': + return gr.update(value=10) + else: + return gr.update(value=30) + + gen_mode.change(on_gen_mode_change, inputs=[gen_mode], outputs=[num_steps]) + + def on_decode_mode_change(value): + if value == 'Low': + return gr.update(value=196) + elif value == 'Standard': + return gr.update(value=256) + else: + return gr.update(value=384) + + decode_mode.change(on_decode_mode_change, inputs=[decode_mode], outputs=[octree_resolution]) + + def on_export_click(file_out, file_out2, file_type, reduce_face, export_texture, target_face_num): + if file_out is None: + raise gr.Error('Please generate a mesh first.') + + print(f'exporting {file_out}') + print(f'reduce face to {target_face_num}') + if export_texture: + mesh = trimesh.load(file_out2) + save_folder = gen_save_folder() + path = export_mesh(mesh, save_folder, textured=True, type=file_type) + + # for preview + save_folder = gen_save_folder() + _ = export_mesh(mesh, save_folder, textured=True) + model_viewer_html = build_model_viewer_html(save_folder, height=HTML_HEIGHT, width=HTML_WIDTH, + textured=True) + else: + mesh = trimesh.load(file_out) + mesh = floater_remove_worker(mesh) + mesh = degenerate_face_remove_worker(mesh) + if reduce_face: + mesh = face_reduce_worker(mesh, target_face_num) + save_folder = gen_save_folder() + path = export_mesh(mesh, save_folder, textured=False, type=file_type) + + # for preview + save_folder = gen_save_folder() + _ = export_mesh(mesh, save_folder, textured=False) + model_viewer_html = build_model_viewer_html(save_folder, height=HTML_HEIGHT, width=HTML_WIDTH, + textured=False) + print(f'export to {path}') + return model_viewer_html, gr.update(value=path, interactive=True) + + confirm_export.click( + lambda: gr.update(selected='export_mesh_panel'), + outputs=[tabs_output], + ).then( + on_export_click, + inputs=[file_out, file_out2, file_type, reduce_face, export_texture, target_face_num], + outputs=[html_export_mesh, file_export] + ) + + return demo + + +if __name__ == '__main__': + import argparse + + parser = 
argparse.ArgumentParser() + parser.add_argument("--model_path", type=str, default='tencent/Hunyuan3D-2mini') + parser.add_argument("--subfolder", type=str, default='hunyuan3d-dit-v2-mini-turbo') + parser.add_argument("--texgen_model_path", type=str, default='tencent/Hunyuan3D-2') + parser.add_argument('--port', type=int, default=8080) + parser.add_argument('--host', type=str, default='0.0.0.0') + parser.add_argument('--device', type=str, default='cuda') + parser.add_argument('--mc_algo', type=str, default='mc') + parser.add_argument('--cache-path', type=str, default='gradio_cache') + parser.add_argument('--enable_t23d', action='store_true') + parser.add_argument('--disable_tex', action='store_true') + parser.add_argument('--enable_flashvdm', action='store_true') + parser.add_argument('--compile', action='store_true') + parser.add_argument('--low_vram_mode', action='store_true') + args = parser.parse_args() + + SAVE_DIR = args.cache_path + os.makedirs(SAVE_DIR, exist_ok=True) + + CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) + MV_MODE = 'mv' in args.model_path + TURBO_MODE = 'turbo' in args.subfolder + + HTML_HEIGHT = 690 if MV_MODE else 650 + HTML_WIDTH = 500 + HTML_OUTPUT_PLACEHOLDER = f""" +
+
+

Welcome to Hunyuan3D!

+

No mesh here.

+
+
+ """ + + INPUT_MESH_HTML = """ +
+
+ """ + example_is = get_example_img_list() + example_ts = get_example_txt_list() + example_mvs = get_example_mv_list() + + SUPPORTED_FORMATS = ['glb', 'obj', 'ply', 'stl'] + + HAS_TEXTUREGEN = False + if not args.disable_tex: + try: + from hy3dgen.texgen import Hunyuan3DPaintPipeline + + texgen_worker = Hunyuan3DPaintPipeline.from_pretrained(args.texgen_model_path) + if args.low_vram_mode: + texgen_worker.enable_model_cpu_offload() + # Not help much, ignore for now. + # if args.compile: + # texgen_worker.models['delight_model'].pipeline.unet.compile() + # texgen_worker.models['delight_model'].pipeline.vae.compile() + # texgen_worker.models['multiview_model'].pipeline.unet.compile() + # texgen_worker.models['multiview_model'].pipeline.vae.compile() + HAS_TEXTUREGEN = True + except Exception as e: + print(e) + print("Failed to load texture generator.") + print('Please try to install requirements by following README.md') + HAS_TEXTUREGEN = False + + HAS_T2I = True + if args.enable_t23d: + from hy3dgen.text2image import HunyuanDiTPipeline + + t2i_worker = HunyuanDiTPipeline('Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled', device=args.device) + HAS_T2I = True + + from hy3dgen.shapegen import FaceReducer, FloaterRemover, DegenerateFaceRemover, MeshSimplifier, \ + Hunyuan3DDiTFlowMatchingPipeline + from hy3dgen.shapegen.pipelines import export_to_trimesh + from hy3dgen.rembg import BackgroundRemover + + rmbg_worker = BackgroundRemover() + i23d_worker = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained( + args.model_path, + subfolder=args.subfolder, + use_safetensors=True, + device=args.device, + ) + if args.enable_flashvdm: + mc_algo = 'mc' if args.device in ['cpu', 'mps'] else args.mc_algo + i23d_worker.enable_flashvdm(mc_algo=mc_algo) + if args.compile: + i23d_worker.compile() + + floater_remove_worker = FloaterRemover() + degenerate_face_remove_worker = DegenerateFaceRemover() + face_reduce_worker = FaceReducer() + + # https://discuss.huggingface.co/t/how-to-serve-an-html-file/33921/2 + # create a FastAPI app + app = FastAPI() + # create a static directory to store the static files + static_dir = Path(SAVE_DIR).absolute() + static_dir.mkdir(parents=True, exist_ok=True) + app.mount("/static", StaticFiles(directory=static_dir, html=True), name="static") + shutil.copytree('./assets/env_maps', os.path.join(static_dir, 'env_maps'), dirs_exist_ok=True) + + if args.low_vram_mode: + torch.cuda.empty_cache() + demo = build_app() + app = gr.mount_gradio_app(app, demo, path="/") + uvicorn.run(app, host=args.host, port=args.port, workers=1) diff --git a/hy3dgen/__init__.py b/hy3dgen/__init__.py new file mode 100644 index 0000000..8bb2bf8 --- /dev/null +++ b/hy3dgen/__init__.py @@ -0,0 +1,13 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. 
+ +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. \ No newline at end of file diff --git a/hy3dgen/rembg.py b/hy3dgen/rembg.py new file mode 100644 index 0000000..6247f06 --- /dev/null +++ b/hy3dgen/rembg.py @@ -0,0 +1,25 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +from PIL import Image +from rembg import remove, new_session + + +class BackgroundRemover(): + def __init__(self): + self.session = new_session() + + def __call__(self, image: Image.Image): + output = remove(image, session=self.session, bgcolor=[255, 255, 255, 0]) + return output diff --git a/hy3dgen/shapegen/__init__.py b/hy3dgen/shapegen/__init__.py new file mode 100644 index 0000000..1b1f9cc --- /dev/null +++ b/hy3dgen/shapegen/__init__.py @@ -0,0 +1,17 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
+ +from .pipelines import Hunyuan3DDiTPipeline, Hunyuan3DDiTFlowMatchingPipeline +from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover, MeshSimplifier +from .preprocessors import ImageProcessorV2, IMAGE_PROCESSORS, DEFAULT_IMAGEPROCESSOR diff --git a/hy3dgen/shapegen/models/__init__.py b/hy3dgen/shapegen/models/__init__.py new file mode 100644 index 0000000..8179353 --- /dev/null +++ b/hy3dgen/shapegen/models/__init__.py @@ -0,0 +1,28 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + + +from .autoencoders import ShapeVAE +from .conditioner import DualImageEncoder, SingleImageEncoder, DinoImageEncoder, CLIPImageEncoder +from .denoisers import Hunyuan3DDiT diff --git a/hy3dgen/shapegen/models/autoencoders/__init__.py b/hy3dgen/shapegen/models/autoencoders/__init__.py new file mode 100644 index 0000000..20bbf8d --- /dev/null +++ b/hy3dgen/shapegen/models/autoencoders/__init__.py @@ -0,0 +1,20 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
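Editorial sketch: BackgroundRemover above is a thin wrapper around rembg's remove() with a cached session, and it returns an RGBA image whose background is made transparent. A small usage sketch follows (the input path is an assumption). Note that several example scripts call .convert("RGBA") before the `if image.mode == 'RGB'` check, so the remover branch never runs there; to actually strip a background, test the mode of the image as loaded.

from PIL import Image
from hy3dgen.rembg import BackgroundRemover

rembg = BackgroundRemover()

image = Image.open('assets/demo.png')   # illustrative path
if image.mode == 'RGB':                 # check before any convert("RGBA") call
    image = rembg(image)                # returns RGBA with a transparent background
else:
    image = image.convert('RGBA')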
+ +from .attention_blocks import CrossAttentionDecoder +from .attention_processors import FlashVDMCrossAttentionProcessor, CrossAttentionProcessor, \ + FlashVDMTopMCrossAttentionProcessor +from .model import ShapeVAE, VectsetVAE +from .surface_extractors import SurfaceExtractors, MCSurfaceExtractor, DMCSurfaceExtractor, Latent2MeshOutput +from .volume_decoders import HierarchicalVolumeDecoding, FlashVDMVolumeDecoding, VanillaVolumeDecoder diff --git a/hy3dgen/shapegen/models/autoencoders/attention_blocks.py b/hy3dgen/shapegen/models/autoencoders/attention_blocks.py new file mode 100644 index 0000000..ab34eeb --- /dev/null +++ b/hy3dgen/shapegen/models/autoencoders/attention_blocks.py @@ -0,0 +1,493 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + + +import os +from typing import Optional + +import torch +import torch.nn as nn +from einops import rearrange + +from .attention_processors import CrossAttentionProcessor +from ...utils import logger + +scaled_dot_product_attention = nn.functional.scaled_dot_product_attention + +if os.environ.get('USE_SAGEATTN', '0') == '1': + try: + from sageattention import sageattn + except ImportError: + raise ImportError('Please install the package "sageattention" to use this USE_SAGEATTN.') + scaled_dot_product_attention = sageattn + + +class FourierEmbedder(nn.Module): + """The sin/cosine positional embedding. Given an input tensor `x` of shape [n_batch, ..., c_dim], it converts + each feature dimension of `x[..., i]` into: + [ + sin(x[..., i]), + sin(f_1*x[..., i]), + sin(f_2*x[..., i]), + ... + sin(f_N * x[..., i]), + cos(x[..., i]), + cos(f_1*x[..., i]), + cos(f_2*x[..., i]), + ... + cos(f_N * x[..., i]), + x[..., i] # only present if include_input is True. + ], here f_i is the frequency. + + Denote the space is [0 / num_freqs, 1 / num_freqs, 2 / num_freqs, 3 / num_freqs, ..., (num_freqs - 1) / num_freqs]. + If logspace is True, then the frequency f_i is [2^(0 / num_freqs), ..., 2^(i / num_freqs), ...]; + Otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)]. + + Args: + num_freqs (int): the number of frequencies, default is 6; + logspace (bool): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...], + otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)]; + input_dim (int): the input dimension, default is 3; + include_input (bool): include the input tensor or not, default is True. 
+ + Attributes: + frequencies (torch.Tensor): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...], + otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1); + + out_dim (int): the embedding size, if include_input is True, it is input_dim * (num_freqs * 2 + 1), + otherwise, it is input_dim * num_freqs * 2. + + """ + + def __init__(self, + num_freqs: int = 6, + logspace: bool = True, + input_dim: int = 3, + include_input: bool = True, + include_pi: bool = True) -> None: + + """The initialization""" + + super().__init__() + + if logspace: + frequencies = 2.0 ** torch.arange( + num_freqs, + dtype=torch.float32 + ) + else: + frequencies = torch.linspace( + 1.0, + 2.0 ** (num_freqs - 1), + num_freqs, + dtype=torch.float32 + ) + + if include_pi: + frequencies *= torch.pi + + self.register_buffer("frequencies", frequencies, persistent=False) + self.include_input = include_input + self.num_freqs = num_freqs + + self.out_dim = self.get_dims(input_dim) + + def get_dims(self, input_dim): + temp = 1 if self.include_input or self.num_freqs == 0 else 0 + out_dim = input_dim * (self.num_freqs * 2 + temp) + + return out_dim + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ Forward process. + + Args: + x: tensor of shape [..., dim] + + Returns: + embedding: an embedding of `x` of shape [..., dim * (num_freqs * 2 + temp)] + where temp is 1 if include_input is True and 0 otherwise. + """ + + if self.num_freqs > 0: + embed = (x[..., None].contiguous() * self.frequencies).view(*x.shape[:-1], -1) + if self.include_input: + return torch.cat((x, embed.sin(), embed.cos()), dim=-1) + else: + return torch.cat((embed.sin(), embed.cos()), dim=-1) + else: + return x + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + self.scale_by_keep = scale_by_keep + + def forward(self, x): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. + + """ + if self.drop_prob == 0. or not self.training: + return x + keep_prob = 1 - self.drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0 and self.scale_by_keep: + random_tensor.div_(keep_prob) + return x * random_tensor + + def extra_repr(self): + return f'drop_prob={round(self.drop_prob, 3):0.3f}' + + +class MLP(nn.Module): + def __init__( + self, *, + width: int, + expand_ratio: int = 4, + output_width: int = None, + drop_path_rate: float = 0.0 + ): + super().__init__() + self.width = width + self.c_fc = nn.Linear(width, width * expand_ratio) + self.c_proj = nn.Linear(width * expand_ratio, output_width if output_width is not None else width) + self.gelu = nn.GELU() + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. 
else nn.Identity() + + def forward(self, x): + return self.drop_path(self.c_proj(self.gelu(self.c_fc(x)))) + + +class QKVMultiheadCrossAttention(nn.Module): + def __init__( + self, + *, + heads: int, + n_data: Optional[int] = None, + width=None, + qk_norm=False, + norm_layer=nn.LayerNorm + ): + super().__init__() + self.heads = heads + self.n_data = n_data + self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() + self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() + + self.attn_processor = CrossAttentionProcessor() + + def forward(self, q, kv): + _, n_ctx, _ = q.shape + bs, n_data, width = kv.shape + attn_ch = width // self.heads // 2 + q = q.view(bs, n_ctx, self.heads, -1) + kv = kv.view(bs, n_data, self.heads, -1) + k, v = torch.split(kv, attn_ch, dim=-1) + + q = self.q_norm(q) + k = self.k_norm(k) + q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v)) + out = self.attn_processor(self, q, k, v) + out = out.transpose(1, 2).reshape(bs, n_ctx, -1) + return out + + +class MultiheadCrossAttention(nn.Module): + def __init__( + self, + *, + width: int, + heads: int, + qkv_bias: bool = True, + n_data: Optional[int] = None, + data_width: Optional[int] = None, + norm_layer=nn.LayerNorm, + qk_norm: bool = False, + kv_cache: bool = False, + ): + super().__init__() + self.n_data = n_data + self.width = width + self.heads = heads + self.data_width = width if data_width is None else data_width + self.c_q = nn.Linear(width, width, bias=qkv_bias) + self.c_kv = nn.Linear(self.data_width, width * 2, bias=qkv_bias) + self.c_proj = nn.Linear(width, width) + self.attention = QKVMultiheadCrossAttention( + heads=heads, + n_data=n_data, + width=width, + norm_layer=norm_layer, + qk_norm=qk_norm + ) + self.kv_cache = kv_cache + self.data = None + + def forward(self, x, data): + x = self.c_q(x) + if self.kv_cache: + if self.data is None: + self.data = self.c_kv(data) + logger.info('Save kv cache,this should be called only once for one mesh') + data = self.data + else: + data = self.c_kv(data) + x = self.attention(x, data) + x = self.c_proj(x) + return x + + +class ResidualCrossAttentionBlock(nn.Module): + def __init__( + self, + *, + n_data: Optional[int] = None, + width: int, + heads: int, + mlp_expand_ratio: int = 4, + data_width: Optional[int] = None, + qkv_bias: bool = True, + norm_layer=nn.LayerNorm, + qk_norm: bool = False + ): + super().__init__() + + if data_width is None: + data_width = width + + self.attn = MultiheadCrossAttention( + n_data=n_data, + width=width, + heads=heads, + data_width=data_width, + qkv_bias=qkv_bias, + norm_layer=norm_layer, + qk_norm=qk_norm + ) + self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6) + self.ln_2 = norm_layer(data_width, elementwise_affine=True, eps=1e-6) + self.ln_3 = norm_layer(width, elementwise_affine=True, eps=1e-6) + self.mlp = MLP(width=width, expand_ratio=mlp_expand_ratio) + + def forward(self, x: torch.Tensor, data: torch.Tensor): + x = x + self.attn(self.ln_1(x), self.ln_2(data)) + x = x + self.mlp(self.ln_3(x)) + return x + + +class QKVMultiheadAttention(nn.Module): + def __init__( + self, + *, + heads: int, + n_ctx: int, + width=None, + qk_norm=False, + norm_layer=nn.LayerNorm + ): + super().__init__() + self.heads = heads + self.n_ctx = n_ctx + self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() + self.k_norm = norm_layer(width // heads, 
elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity() + + def forward(self, qkv): + bs, n_ctx, width = qkv.shape + attn_ch = width // self.heads // 3 + qkv = qkv.view(bs, n_ctx, self.heads, -1) + q, k, v = torch.split(qkv, attn_ch, dim=-1) + + q = self.q_norm(q) + k = self.k_norm(k) + + q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v)) + out = scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1) + return out + + +class MultiheadAttention(nn.Module): + def __init__( + self, + *, + n_ctx: int, + width: int, + heads: int, + qkv_bias: bool, + norm_layer=nn.LayerNorm, + qk_norm: bool = False, + drop_path_rate: float = 0.0 + ): + super().__init__() + self.n_ctx = n_ctx + self.width = width + self.heads = heads + self.c_qkv = nn.Linear(width, width * 3, bias=qkv_bias) + self.c_proj = nn.Linear(width, width) + self.attention = QKVMultiheadAttention( + heads=heads, + n_ctx=n_ctx, + width=width, + norm_layer=norm_layer, + qk_norm=qk_norm + ) + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity() + + def forward(self, x): + x = self.c_qkv(x) + x = self.attention(x) + x = self.drop_path(self.c_proj(x)) + return x + + +class ResidualAttentionBlock(nn.Module): + def __init__( + self, + *, + n_ctx: int, + width: int, + heads: int, + qkv_bias: bool = True, + norm_layer=nn.LayerNorm, + qk_norm: bool = False, + drop_path_rate: float = 0.0, + ): + super().__init__() + self.attn = MultiheadAttention( + n_ctx=n_ctx, + width=width, + heads=heads, + qkv_bias=qkv_bias, + norm_layer=norm_layer, + qk_norm=qk_norm, + drop_path_rate=drop_path_rate + ) + self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6) + self.mlp = MLP(width=width, drop_path_rate=drop_path_rate) + self.ln_2 = norm_layer(width, elementwise_affine=True, eps=1e-6) + + def forward(self, x: torch.Tensor): + x = x + self.attn(self.ln_1(x)) + x = x + self.mlp(self.ln_2(x)) + return x + + +class Transformer(nn.Module): + def __init__( + self, + *, + n_ctx: int, + width: int, + layers: int, + heads: int, + qkv_bias: bool = True, + norm_layer=nn.LayerNorm, + qk_norm: bool = False, + drop_path_rate: float = 0.0 + ): + super().__init__() + self.n_ctx = n_ctx + self.width = width + self.layers = layers + self.resblocks = nn.ModuleList( + [ + ResidualAttentionBlock( + n_ctx=n_ctx, + width=width, + heads=heads, + qkv_bias=qkv_bias, + norm_layer=norm_layer, + qk_norm=qk_norm, + drop_path_rate=drop_path_rate + ) + for _ in range(layers) + ] + ) + + def forward(self, x: torch.Tensor): + for block in self.resblocks: + x = block(x) + return x + + +class CrossAttentionDecoder(nn.Module): + + def __init__( + self, + *, + num_latents: int, + out_channels: int, + fourier_embedder: FourierEmbedder, + width: int, + heads: int, + mlp_expand_ratio: int = 4, + downsample_ratio: int = 1, + enable_ln_post: bool = True, + qkv_bias: bool = True, + qk_norm: bool = False, + label_type: str = "binary" + ): + super().__init__() + + self.enable_ln_post = enable_ln_post + self.fourier_embedder = fourier_embedder + self.downsample_ratio = downsample_ratio + self.query_proj = nn.Linear(self.fourier_embedder.out_dim, width) + if self.downsample_ratio != 1: + self.latents_proj = nn.Linear(width * downsample_ratio, width) + if self.enable_ln_post == False: + qk_norm = False + self.cross_attn_decoder = ResidualCrossAttentionBlock( + n_data=num_latents, + width=width, + mlp_expand_ratio=mlp_expand_ratio, + heads=heads, + qkv_bias=qkv_bias, + qk_norm=qk_norm + ) + + if 
self.enable_ln_post: + self.ln_post = nn.LayerNorm(width) + self.output_proj = nn.Linear(width, out_channels) + self.label_type = label_type + self.count = 0 + + def set_cross_attention_processor(self, processor): + self.cross_attn_decoder.attn.attention.attn_processor = processor + + def set_default_cross_attention_processor(self): + self.cross_attn_decoder.attn.attention.attn_processor = CrossAttentionProcessor + + def forward(self, queries=None, query_embeddings=None, latents=None): + if query_embeddings is None: + query_embeddings = self.query_proj(self.fourier_embedder(queries).to(latents.dtype)) + self.count += query_embeddings.shape[1] + if self.downsample_ratio != 1: + latents = self.latents_proj(latents) + x = self.cross_attn_decoder(query_embeddings, latents) + if self.enable_ln_post: + x = self.ln_post(x) + occ = self.output_proj(x) + return occ diff --git a/hy3dgen/shapegen/models/autoencoders/attention_processors.py b/hy3dgen/shapegen/models/autoencoders/attention_processors.py new file mode 100644 index 0000000..f7b232e --- /dev/null +++ b/hy3dgen/shapegen/models/autoencoders/attention_processors.py @@ -0,0 +1,96 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
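Editorial sketch: a quick sanity check on FourierEmbedder from attention_blocks.py above. With include_input=True the output width is input_dim * (2 * num_freqs + 1), so the num_freqs=8 default used by ShapeVAE maps 3-D query points to 3 * 17 = 51 channels. The batch and point counts below are arbitrary.

import torch
from hy3dgen.shapegen.models.autoencoders.attention_blocks import FourierEmbedder

embedder = FourierEmbedder(num_freqs=8, input_dim=3, include_input=True, include_pi=True)
queries = torch.rand(2, 4096, 3) * 2 - 1   # toy query points in [-1, 1]

embedded = embedder(queries)
print(embedder.out_dim)    # 51
print(embedded.shape)      # torch.Size([2, 4096, 51])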
+ +import os + +import torch +import torch.nn.functional as F + +scaled_dot_product_attention = F.scaled_dot_product_attention +if os.environ.get('CA_USE_SAGEATTN', '0') == '1': + try: + from sageattention import sageattn + except ImportError: + raise ImportError('Please install the package "sageattention" to use this USE_SAGEATTN.') + scaled_dot_product_attention = sageattn + + +class CrossAttentionProcessor: + def __call__(self, attn, q, k, v): + out = scaled_dot_product_attention(q, k, v) + return out + + +class FlashVDMCrossAttentionProcessor: + def __init__(self, topk=None): + self.topk = topk + + def __call__(self, attn, q, k, v): + if k.shape[-2] == 3072: + topk = 1024 + elif k.shape[-2] == 512: + topk = 256 + else: + topk = k.shape[-2] // 3 + + if self.topk is True: + q1 = q[:, :, ::100, :] + sim = q1 @ k.transpose(-1, -2) + sim = torch.mean(sim, -2) + topk_ind = torch.topk(sim, dim=-1, k=topk).indices.squeeze(-2).unsqueeze(-1) + topk_ind = topk_ind.expand(-1, -1, -1, v.shape[-1]) + v0 = torch.gather(v, dim=-2, index=topk_ind) + k0 = torch.gather(k, dim=-2, index=topk_ind) + out = scaled_dot_product_attention(q, k0, v0) + elif self.topk is False: + out = scaled_dot_product_attention(q, k, v) + else: + idx, counts = self.topk + start = 0 + outs = [] + for grid_coord, count in zip(idx, counts): + end = start + count + q_chunk = q[:, :, start:end, :] + k0, v0 = self.select_topkv(q_chunk, k, v, topk) + out = scaled_dot_product_attention(q_chunk, k0, v0) + outs.append(out) + start += count + out = torch.cat(outs, dim=-2) + self.topk = False + return out + + def select_topkv(self, q_chunk, k, v, topk): + q1 = q_chunk[:, :, ::50, :] + sim = q1 @ k.transpose(-1, -2) + sim = torch.mean(sim, -2) + topk_ind = torch.topk(sim, dim=-1, k=topk).indices.squeeze(-2).unsqueeze(-1) + topk_ind = topk_ind.expand(-1, -1, -1, v.shape[-1]) + v0 = torch.gather(v, dim=-2, index=topk_ind) + k0 = torch.gather(k, dim=-2, index=topk_ind) + return k0, v0 + + +class FlashVDMTopMCrossAttentionProcessor(FlashVDMCrossAttentionProcessor): + def select_topkv(self, q_chunk, k, v, topk): + q1 = q_chunk[:, :, ::30, :] + sim = q1 @ k.transpose(-1, -2) + # sim = sim.to(torch.float32) + sim = sim.softmax(-1) + sim = torch.mean(sim, 1) + activated_token = torch.where(sim > 1e-6)[2] + index = torch.unique(activated_token, return_counts=True)[0].unsqueeze(0).unsqueeze(0).unsqueeze(-1) + index = index.expand(-1, v.shape[1], -1, v.shape[-1]) + v0 = torch.gather(v, dim=-2, index=index) + k0 = torch.gather(k, dim=-2, index=index) + return k0, v0 diff --git a/hy3dgen/shapegen/models/autoencoders/model.py b/hy3dgen/shapegen/models/autoencoders/model.py new file mode 100644 index 0000000..76f78da --- /dev/null +++ b/hy3dgen/shapegen/models/autoencoders/model.py @@ -0,0 +1,189 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. 
+ +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import os + +import torch +import torch.nn as nn +import yaml + +from .attention_blocks import FourierEmbedder, Transformer, CrossAttentionDecoder +from .surface_extractors import MCSurfaceExtractor, SurfaceExtractors +from .volume_decoders import VanillaVolumeDecoder, FlashVDMVolumeDecoding, HierarchicalVolumeDecoding +from ...utils import logger, synchronize_timer, smart_load_model + + +class VectsetVAE(nn.Module): + + @classmethod + @synchronize_timer('VectsetVAE Model Loading') + def from_single_file( + cls, + ckpt_path, + config_path, + device='cuda', + dtype=torch.float16, + use_safetensors=None, + **kwargs, + ): + # load config + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + + # load ckpt + if use_safetensors: + ckpt_path = ckpt_path.replace('.ckpt', '.safetensors') + if not os.path.exists(ckpt_path): + raise FileNotFoundError(f"Model file {ckpt_path} not found") + + logger.info(f"Loading model from {ckpt_path}") + if use_safetensors: + import safetensors.torch + ckpt = safetensors.torch.load_file(ckpt_path, device='cpu') + else: + ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=True) + + model_kwargs = config['params'] + model_kwargs.update(kwargs) + + model = cls(**model_kwargs) + model.load_state_dict(ckpt) + model.to(device=device, dtype=dtype) + return model + + @classmethod + def from_pretrained( + cls, + model_path, + device='cuda', + dtype=torch.float16, + use_safetensors=True, + variant='fp16', + subfolder='hunyuan3d-vae-v2-0', + **kwargs, + ): + config_path, ckpt_path = smart_load_model( + model_path, + subfolder=subfolder, + use_safetensors=use_safetensors, + variant=variant + ) + + return cls.from_single_file( + ckpt_path, + config_path, + device=device, + dtype=dtype, + use_safetensors=use_safetensors, + **kwargs + ) + + def __init__( + self, + volume_decoder=None, + surface_extractor=None + ): + super().__init__() + if volume_decoder is None: + volume_decoder = VanillaVolumeDecoder() + if surface_extractor is None: + surface_extractor = MCSurfaceExtractor() + self.volume_decoder = volume_decoder + self.surface_extractor = surface_extractor + + def latents2mesh(self, latents: torch.FloatTensor, **kwargs): + with synchronize_timer('Volume decoding'): + grid_logits = self.volume_decoder(latents, self.geo_decoder, **kwargs) + with synchronize_timer('Surface extraction'): + outputs = self.surface_extractor(grid_logits, **kwargs) + return outputs + + def enable_flashvdm_decoder( + self, + enabled: bool = True, + adaptive_kv_selection=True, + topk_mode='mean', + mc_algo='dmc', + ): + if enabled: + if adaptive_kv_selection: + self.volume_decoder = FlashVDMVolumeDecoding(topk_mode) + else: + self.volume_decoder = HierarchicalVolumeDecoding() + if mc_algo not in SurfaceExtractors.keys(): + raise ValueError(f'Unsupported mc_algo {mc_algo}, available: {list(SurfaceExtractors.keys())}') + self.surface_extractor = SurfaceExtractors[mc_algo]() + else: + self.volume_decoder = VanillaVolumeDecoder() + self.surface_extractor = MCSurfaceExtractor() + + +class ShapeVAE(VectsetVAE): + def __init__( + self, + *, + num_latents: 
int, + embed_dim: int, + width: int, + heads: int, + num_decoder_layers: int, + geo_decoder_downsample_ratio: int = 1, + geo_decoder_mlp_expand_ratio: int = 4, + geo_decoder_ln_post: bool = True, + num_freqs: int = 8, + include_pi: bool = True, + qkv_bias: bool = True, + qk_norm: bool = False, + label_type: str = "binary", + drop_path_rate: float = 0.0, + scale_factor: float = 1.0, + ): + super().__init__() + self.geo_decoder_ln_post = geo_decoder_ln_post + + self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi) + + self.post_kl = nn.Linear(embed_dim, width) + + self.transformer = Transformer( + n_ctx=num_latents, + width=width, + layers=num_decoder_layers, + heads=heads, + qkv_bias=qkv_bias, + qk_norm=qk_norm, + drop_path_rate=drop_path_rate + ) + + self.geo_decoder = CrossAttentionDecoder( + fourier_embedder=self.fourier_embedder, + out_channels=1, + num_latents=num_latents, + mlp_expand_ratio=geo_decoder_mlp_expand_ratio, + downsample_ratio=geo_decoder_downsample_ratio, + enable_ln_post=self.geo_decoder_ln_post, + width=width // geo_decoder_downsample_ratio, + heads=heads // geo_decoder_downsample_ratio, + qkv_bias=qkv_bias, + qk_norm=qk_norm, + label_type=label_type, + ) + + self.scale_factor = scale_factor + self.latent_shape = (num_latents, embed_dim) + + def forward(self, latents): + latents = self.post_kl(latents) + latents = self.transformer(latents) + return latents diff --git a/hy3dgen/shapegen/models/autoencoders/surface_extractors.py b/hy3dgen/shapegen/models/autoencoders/surface_extractors.py new file mode 100644 index 0000000..f4d8f63 --- /dev/null +++ b/hy3dgen/shapegen/models/autoencoders/surface_extractors.py @@ -0,0 +1,100 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
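ShapeVAE therefore splits decoding into two stages: forward maps the KL latents through post_kl and the transformer, and latents2mesh hands the result to the volume decoder and surface extractor. A minimal usage sketch; the repo id matches the VAE mapping used by the pipeline later in this commit, the random latents stand in for diffusion output, and the numeric settings are illustrative:

import torch
from hy3dgen.shapegen.models.autoencoders import ShapeVAE

vae = ShapeVAE.from_pretrained('tencent/Hunyuan3D-2', subfolder='hunyuan3d-vae-v2-0')
latents = torch.randn(1, *vae.latent_shape, dtype=torch.float16, device='cuda')

latents = vae(latents)                  # post_kl projection + transformer
meshes = vae.latents2mesh(
    latents,
    bounds=1.01,                        # kwargs are forwarded to the volume decoder
    mc_level=0.0,                       # and to the surface extractor
    octree_resolution=256,
    num_chunks=8000,
)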
+
+from typing import Union, Tuple, List
+
+import numpy as np
+import torch
+from skimage import measure
+
+
+class Latent2MeshOutput:
+
+    def __init__(self, mesh_v=None, mesh_f=None):
+        self.mesh_v = mesh_v
+        self.mesh_f = mesh_f
+
+
+def center_vertices(vertices):
+    """Translate the vertices so that bounding box is centered at zero."""
+    vert_min = vertices.min(dim=0)[0]
+    vert_max = vertices.max(dim=0)[0]
+    vert_center = 0.5 * (vert_min + vert_max)
+    return vertices - vert_center
+
+
+class SurfaceExtractor:
+    def _compute_box_stat(self, bounds: Union[Tuple[float], List[float], float], octree_resolution: int):
+        if isinstance(bounds, float):
+            bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds]
+
+        bbox_min, bbox_max = np.array(bounds[0:3]), np.array(bounds[3:6])
+        bbox_size = bbox_max - bbox_min
+        grid_size = [int(octree_resolution) + 1, int(octree_resolution) + 1, int(octree_resolution) + 1]
+        return grid_size, bbox_min, bbox_size
+
+    def run(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def __call__(self, grid_logits, **kwargs):
+        outputs = []
+        for i in range(grid_logits.shape[0]):
+            try:
+                vertices, faces = self.run(grid_logits[i], **kwargs)
+                vertices = vertices.astype(np.float32)
+                faces = np.ascontiguousarray(faces)
+                outputs.append(Latent2MeshOutput(mesh_v=vertices, mesh_f=faces))
+
+            except Exception:
+                import traceback
+                traceback.print_exc()
+                outputs.append(None)
+
+        return outputs
+
+
+class MCSurfaceExtractor(SurfaceExtractor):
+    def run(self, grid_logit, *, mc_level, bounds, octree_resolution, **kwargs):
+        vertices, faces, normals, _ = measure.marching_cubes(
+            grid_logit.cpu().numpy(),
+            mc_level,
+            method="lewiner"
+        )
+        grid_size, bbox_min, bbox_size = self._compute_box_stat(bounds, octree_resolution)
+        vertices = vertices / grid_size * bbox_size + bbox_min
+        return vertices, faces
+
+
+class DMCSurfaceExtractor(SurfaceExtractor):
+    def run(self, grid_logit, *, octree_resolution, **kwargs):
+        device = grid_logit.device
+        if not hasattr(self, 'dmc'):
+            try:
+                from diso import DiffDMC
+            except ImportError:
+                raise ImportError("Please install diso via `pip install diso`, or set mc_algo to 'mc'")
+            self.dmc = DiffDMC(dtype=torch.float32).to(device)
+        sdf = -grid_logit / octree_resolution
+        sdf = sdf.to(torch.float32).contiguous()
+        verts, faces = self.dmc(sdf, deform=None, return_quads=False, normalize=True)
+        verts = center_vertices(verts)
+        vertices = verts.detach().cpu().numpy()
+        faces = faces.detach().cpu().numpy()[:, ::-1]
+        return vertices, faces
+
+
+SurfaceExtractors = {
+    'mc': MCSurfaceExtractor,
+    'dmc': DMCSurfaceExtractor,
+}
diff --git a/hy3dgen/shapegen/models/autoencoders/volume_decoders.py b/hy3dgen/shapegen/models/autoencoders/volume_decoders.py
new file mode 100644
index 0000000..d7bfd84
--- /dev/null
+++ b/hy3dgen/shapegen/models/autoencoders/volume_decoders.py
@@ -0,0 +1,435 @@
+# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
+# except for the third-party components listed below.
+# Hunyuan 3D does not impose any additional limitations beyond what is outlined
+# in the respective licenses of these third-party components.
+# Users must comply with all terms and conditions of original licenses of these third-party
+# components and must ensure that the usage of the third party components adheres to
+# all relevant laws and regulations.
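The extractors are looked up through the SurfaceExtractors registry and can also be run directly on a decoded logit grid; items that fail come back as None rather than raising. A small sketch with a random grid standing in for real volume-decoder output (the resolution and output filename are illustrative):

import torch
import trimesh
from hy3dgen.shapegen.models.autoencoders import SurfaceExtractors

grid_logits = torch.randn(1, 65, 65, 65)       # (batch, res + 1, res + 1, res + 1)
extractor = SurfaceExtractors['mc']()          # 'dmc' additionally requires the diso package
outputs = extractor(grid_logits, mc_level=0.0, bounds=1.01, octree_resolution=64)

result = outputs[0]                            # Latent2MeshOutput, or None on failure
if result is not None:
    trimesh.Trimesh(result.mesh_v, result.mesh_f).export('shape.glb')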
+
+# For avoidance of doubts, Hunyuan 3D means the large language models and
+# their software and algorithms, including trained model weights, parameters (including
+# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
+# fine-tuning enabling code and other elements of the foregoing made publicly available
+# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
+
+from typing import Union, Tuple, List, Callable
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import repeat
+from tqdm import tqdm
+
+from .attention_blocks import CrossAttentionDecoder
+from .attention_processors import FlashVDMCrossAttentionProcessor, FlashVDMTopMCrossAttentionProcessor
+from ...utils import logger
+
+
+def extract_near_surface_volume_fn(input_tensor: torch.Tensor, alpha: float):
+    device = input_tensor.device
+    D = input_tensor.shape[0]
+    signed_val = 0.0
+
+    # Apply the offset and mark invalid values
+    val = input_tensor + alpha
+    valid_mask = val > -9000  # -9000 is assumed to mark invalid voxels
+
+    # Neighbor lookup that keeps the output shape identical to the input
+    def get_neighbor(t, shift, axis):
+        """Shift the volume along the given axis while keeping its shape."""
+        if shift == 0:
+            return t.clone()
+
+        # Choose the padding axis (the [D, D, D] input corresponds to the z, y, x axes)
+        pad_dims = [0, 0, 0, 0, 0, 0]  # format: [x_front, x_back, y_front, y_back, z_front, z_back]
+
+        # Set the padding amount for the requested axis
+        if axis == 0:  # x axis (last dimension)
+            pad_idx = 0 if shift > 0 else 1
+            pad_dims[pad_idx] = abs(shift)
+        elif axis == 1:  # y axis (middle dimension)
+            pad_idx = 2 if shift > 0 else 3
+            pad_dims[pad_idx] = abs(shift)
+        elif axis == 2:  # z axis (first dimension)
+            pad_idx = 4 if shift > 0 else 5
+            pad_dims[pad_idx] = abs(shift)
+
+        # Pad (add batch and channel dims so the tensor fits F.pad)
+        padded = F.pad(t.unsqueeze(0).unsqueeze(0), pad_dims[::-1], mode='replicate')  # reversed to match F.pad's order
+
+        # Build the slicing indices for the shifted view
+        slice_dims = [slice(None)] * 3  # start with full slices
+        if axis == 0:  # x axis (dim=2)
+            if shift > 0:
+                slice_dims[0] = slice(shift, None)
+            else:
+                slice_dims[0] = slice(None, shift)
+        elif axis == 1:  # y axis (dim=1)
+            if shift > 0:
+                slice_dims[1] = slice(shift, None)
+            else:
+                slice_dims[1] = slice(None, shift)
+        elif axis == 2:  # z axis (dim=0)
+            if shift > 0:
+                slice_dims[2] = slice(shift, None)
+            else:
+                slice_dims[2] = slice(None, shift)
+
+        # Apply the slice and restore the original dimensions
+        padded = padded.squeeze(0).squeeze(0)
+        sliced = padded[slice_dims]
+        return sliced
+
+    # Gather the neighbors in every direction (shapes stay consistent)
+    left = get_neighbor(val, 1, axis=0)  # x direction
+    right = get_neighbor(val, -1, axis=0)
+    back = get_neighbor(val, 1, axis=1)  # y direction
+    front = get_neighbor(val, -1, axis=1)
+    down = get_neighbor(val, 1, axis=2)  # z direction
+    up = get_neighbor(val, -1, axis=2)
+
+    # Handle invalid values at the boundary (where keeps the shapes consistent)
+    def safe_where(neighbor):
+        return torch.where(neighbor > -9000, neighbor, val)
+
+    left = safe_where(left)
+    right = safe_where(right)
+    back = safe_where(back)
+    front = safe_where(front)
+    down = safe_where(down)
+    up = safe_where(up)
+
+    # Compare signs (cast to float32 to keep the comparison exact)
+    sign = torch.sign(val.to(torch.float32))
+    neighbors_sign = torch.stack([
+        torch.sign(left.to(torch.float32)),
+        torch.sign(right.to(torch.float32)),
+        torch.sign(back.to(torch.float32)),
+        torch.sign(front.to(torch.float32)),
+        torch.sign(down.to(torch.float32)),
+        torch.sign(up.to(torch.float32))
+    ], dim=0)
+
+    # Check whether every neighbor sign agrees with the center voxel
+    same_sign = torch.all(neighbors_sign == sign, dim=0)
+
+    # Final mask: voxels with a sign change, restricted to valid voxels
+    mask = (~same_sign).to(torch.int32)
+    return mask * valid_mask.to(torch.int32)
+
+
+def generate_dense_grid_points(
+    bbox_min: np.ndarray,
+    bbox_max: np.ndarray,
+    octree_resolution: int,
+    indexing: str = "ij",
+):
+    length = bbox_max - bbox_min
+    num_cells = octree_resolution
+
+    x = np.linspace(bbox_min[0], bbox_max[0],
int(num_cells) + 1, dtype=np.float32) + y = np.linspace(bbox_min[1], bbox_max[1], int(num_cells) + 1, dtype=np.float32) + z = np.linspace(bbox_min[2], bbox_max[2], int(num_cells) + 1, dtype=np.float32) + [xs, ys, zs] = np.meshgrid(x, y, z, indexing=indexing) + xyz = np.stack((xs, ys, zs), axis=-1) + grid_size = [int(num_cells) + 1, int(num_cells) + 1, int(num_cells) + 1] + + return xyz, grid_size, length + + +class VanillaVolumeDecoder: + @torch.no_grad() + def __call__( + self, + latents: torch.FloatTensor, + geo_decoder: Callable, + bounds: Union[Tuple[float], List[float], float] = 1.01, + num_chunks: int = 10000, + octree_resolution: int = None, + enable_pbar: bool = True, + **kwargs, + ): + device = latents.device + dtype = latents.dtype + batch_size = latents.shape[0] + + # 1. generate query points + if isinstance(bounds, float): + bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds] + + bbox_min, bbox_max = np.array(bounds[0:3]), np.array(bounds[3:6]) + xyz_samples, grid_size, length = generate_dense_grid_points( + bbox_min=bbox_min, + bbox_max=bbox_max, + octree_resolution=octree_resolution, + indexing="ij" + ) + xyz_samples = torch.from_numpy(xyz_samples).to(device, dtype=dtype).contiguous().reshape(-1, 3) + + # 2. latents to 3d volume + batch_logits = [] + for start in tqdm(range(0, xyz_samples.shape[0], num_chunks), desc=f"Volume Decoding", + disable=not enable_pbar): + chunk_queries = xyz_samples[start: start + num_chunks, :] + chunk_queries = repeat(chunk_queries, "p c -> b p c", b=batch_size) + logits = geo_decoder(queries=chunk_queries, latents=latents) + batch_logits.append(logits) + + grid_logits = torch.cat(batch_logits, dim=1) + grid_logits = grid_logits.view((batch_size, *grid_size)).float() + + return grid_logits + + +class HierarchicalVolumeDecoding: + @torch.no_grad() + def __call__( + self, + latents: torch.FloatTensor, + geo_decoder: Callable, + bounds: Union[Tuple[float], List[float], float] = 1.01, + num_chunks: int = 10000, + mc_level: float = 0.0, + octree_resolution: int = None, + min_resolution: int = 63, + enable_pbar: bool = True, + **kwargs, + ): + device = latents.device + dtype = latents.dtype + + resolutions = [] + if octree_resolution < min_resolution: + resolutions.append(octree_resolution) + while octree_resolution >= min_resolution: + resolutions.append(octree_resolution) + octree_resolution = octree_resolution // 2 + resolutions.reverse() + + # 1. generate query points + if isinstance(bounds, float): + bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds] + bbox_min = np.array(bounds[0:3]) + bbox_max = np.array(bounds[3:6]) + bbox_size = bbox_max - bbox_min + + xyz_samples, grid_size, length = generate_dense_grid_points( + bbox_min=bbox_min, + bbox_max=bbox_max, + octree_resolution=resolutions[0], + indexing="ij" + ) + + dilate = nn.Conv3d(1, 1, 3, padding=1, bias=False, device=device, dtype=dtype) + dilate.weight = torch.nn.Parameter(torch.ones(dilate.weight.shape, dtype=dtype, device=device)) + + grid_size = np.array(grid_size) + xyz_samples = torch.from_numpy(xyz_samples).to(device, dtype=dtype).contiguous().reshape(-1, 3) + + # 2. 
latents to 3d volume + batch_logits = [] + batch_size = latents.shape[0] + for start in tqdm(range(0, xyz_samples.shape[0], num_chunks), + desc=f"Hierarchical Volume Decoding [r{resolutions[0] + 1}]"): + queries = xyz_samples[start: start + num_chunks, :] + batch_queries = repeat(queries, "p c -> b p c", b=batch_size) + logits = geo_decoder(queries=batch_queries, latents=latents) + batch_logits.append(logits) + + grid_logits = torch.cat(batch_logits, dim=1).view((batch_size, grid_size[0], grid_size[1], grid_size[2])) + + for octree_depth_now in resolutions[1:]: + grid_size = np.array([octree_depth_now + 1] * 3) + resolution = bbox_size / octree_depth_now + next_index = torch.zeros(tuple(grid_size), dtype=dtype, device=device) + next_logits = torch.full(next_index.shape, -10000., dtype=dtype, device=device) + curr_points = extract_near_surface_volume_fn(grid_logits.squeeze(0), mc_level) + curr_points += grid_logits.squeeze(0).abs() < 0.95 + + if octree_depth_now == resolutions[-1]: + expand_num = 0 + else: + expand_num = 1 + for i in range(expand_num): + curr_points = dilate(curr_points.unsqueeze(0).to(dtype)).squeeze(0) + (cidx_x, cidx_y, cidx_z) = torch.where(curr_points > 0) + next_index[cidx_x * 2, cidx_y * 2, cidx_z * 2] = 1 + for i in range(2 - expand_num): + next_index = dilate(next_index.unsqueeze(0)).squeeze(0) + nidx = torch.where(next_index > 0) + + next_points = torch.stack(nidx, dim=1) + next_points = (next_points * torch.tensor(resolution, dtype=next_points.dtype, device=device) + + torch.tensor(bbox_min, dtype=next_points.dtype, device=device)) + batch_logits = [] + for start in tqdm(range(0, next_points.shape[0], num_chunks), + desc=f"Hierarchical Volume Decoding [r{octree_depth_now + 1}]"): + queries = next_points[start: start + num_chunks, :] + batch_queries = repeat(queries, "p c -> b p c", b=batch_size) + logits = geo_decoder(queries=batch_queries.to(latents.dtype), latents=latents) + batch_logits.append(logits) + grid_logits = torch.cat(batch_logits, dim=1) + next_logits[nidx] = grid_logits[0, ..., 0] + grid_logits = next_logits.unsqueeze(0) + grid_logits[grid_logits == -10000.] = float('nan') + + return grid_logits + + +class FlashVDMVolumeDecoding: + def __init__(self, topk_mode='mean'): + if topk_mode not in ['mean', 'merge']: + raise ValueError(f'Unsupported topk_mode {topk_mode}, available: {["mean", "merge"]}') + + if topk_mode == 'mean': + self.processor = FlashVDMCrossAttentionProcessor() + else: + self.processor = FlashVDMTopMCrossAttentionProcessor() + + @torch.no_grad() + def __call__( + self, + latents: torch.FloatTensor, + geo_decoder: CrossAttentionDecoder, + bounds: Union[Tuple[float], List[float], float] = 1.01, + num_chunks: int = 10000, + mc_level: float = 0.0, + octree_resolution: int = None, + min_resolution: int = 63, + mini_grid_num: int = 4, + enable_pbar: bool = True, + **kwargs, + ): + processor = self.processor + geo_decoder.set_cross_attention_processor(processor) + + device = latents.device + dtype = latents.dtype + + resolutions = [] + if octree_resolution < min_resolution: + resolutions.append(octree_resolution) + while octree_resolution >= min_resolution: + resolutions.append(octree_resolution) + octree_resolution = octree_resolution // 2 + resolutions.reverse() + resolutions[0] = round(resolutions[0] / mini_grid_num) * mini_grid_num - 1 + for i, resolution in enumerate(resolutions[1:]): + resolutions[i + 1] = resolutions[0] * 2 ** (i + 1) + + logger.info(f"FlashVDMVolumeDecoding Resolution: {resolutions}") + + # 1. 
generate query points + if isinstance(bounds, float): + bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds] + bbox_min = np.array(bounds[0:3]) + bbox_max = np.array(bounds[3:6]) + bbox_size = bbox_max - bbox_min + + xyz_samples, grid_size, length = generate_dense_grid_points( + bbox_min=bbox_min, + bbox_max=bbox_max, + octree_resolution=resolutions[0], + indexing="ij" + ) + + dilate = nn.Conv3d(1, 1, 3, padding=1, bias=False, device=device, dtype=dtype) + dilate.weight = torch.nn.Parameter(torch.ones(dilate.weight.shape, dtype=dtype, device=device)) + + grid_size = np.array(grid_size) + + # 2. latents to 3d volume + xyz_samples = torch.from_numpy(xyz_samples).to(device, dtype=dtype) + batch_size = latents.shape[0] + mini_grid_size = xyz_samples.shape[0] // mini_grid_num + xyz_samples = xyz_samples.view( + mini_grid_num, mini_grid_size, + mini_grid_num, mini_grid_size, + mini_grid_num, mini_grid_size, 3 + ).permute( + 0, 2, 4, 1, 3, 5, 6 + ).reshape( + -1, mini_grid_size * mini_grid_size * mini_grid_size, 3 + ) + batch_logits = [] + num_batchs = max(num_chunks // xyz_samples.shape[1], 1) + for start in tqdm(range(0, xyz_samples.shape[0], num_batchs), + desc=f"FlashVDM Volume Decoding", disable=not enable_pbar): + queries = xyz_samples[start: start + num_batchs, :] + batch = queries.shape[0] + batch_latents = repeat(latents.squeeze(0), "p c -> b p c", b=batch) + processor.topk = True + logits = geo_decoder(queries=queries, latents=batch_latents) + batch_logits.append(logits) + grid_logits = torch.cat(batch_logits, dim=0).reshape( + mini_grid_num, mini_grid_num, mini_grid_num, + mini_grid_size, mini_grid_size, + mini_grid_size + ).permute(0, 3, 1, 4, 2, 5).contiguous().view( + (batch_size, grid_size[0], grid_size[1], grid_size[2]) + ) + + for octree_depth_now in resolutions[1:]: + grid_size = np.array([octree_depth_now + 1] * 3) + resolution = bbox_size / octree_depth_now + next_index = torch.zeros(tuple(grid_size), dtype=dtype, device=device) + next_logits = torch.full(next_index.shape, -10000., dtype=dtype, device=device) + curr_points = extract_near_surface_volume_fn(grid_logits.squeeze(0), mc_level) + curr_points += grid_logits.squeeze(0).abs() < 0.95 + + if octree_depth_now == resolutions[-1]: + expand_num = 0 + else: + expand_num = 1 + for i in range(expand_num): + curr_points = dilate(curr_points.unsqueeze(0).to(dtype)).squeeze(0) + (cidx_x, cidx_y, cidx_z) = torch.where(curr_points > 0) + + next_index[cidx_x * 2, cidx_y * 2, cidx_z * 2] = 1 + for i in range(2 - expand_num): + next_index = dilate(next_index.unsqueeze(0)).squeeze(0) + nidx = torch.where(next_index > 0) + + next_points = torch.stack(nidx, dim=1) + next_points = (next_points * torch.tensor(resolution, dtype=torch.float32, device=device) + + torch.tensor(bbox_min, dtype=torch.float32, device=device)) + + query_grid_num = 6 + min_val = next_points.min(axis=0).values + max_val = next_points.max(axis=0).values + vol_queries_index = (next_points - min_val) / (max_val - min_val) * (query_grid_num - 0.001) + index = torch.floor(vol_queries_index).long() + index = index[..., 0] * (query_grid_num ** 2) + index[..., 1] * query_grid_num + index[..., 2] + index = index.sort() + next_points = next_points[index.indices].unsqueeze(0).contiguous() + unique_values = torch.unique(index.values, return_counts=True) + grid_logits = torch.zeros((next_points.shape[1]), dtype=latents.dtype, device=latents.device) + input_grid = [[], []] + logits_grid_list = [] + start_num = 0 + sum_num = 0 + for grid_index, count in 
zip(unique_values[0].cpu().tolist(), unique_values[1].cpu().tolist()): + if sum_num + count < num_chunks or sum_num == 0: + sum_num += count + input_grid[0].append(grid_index) + input_grid[1].append(count) + else: + processor.topk = input_grid + logits_grid = geo_decoder(queries=next_points[:, start_num:start_num + sum_num], latents=latents) + start_num = start_num + sum_num + logits_grid_list.append(logits_grid) + input_grid = [[grid_index], [count]] + sum_num = count + if sum_num > 0: + processor.topk = input_grid + logits_grid = geo_decoder(queries=next_points[:, start_num:start_num + sum_num], latents=latents) + logits_grid_list.append(logits_grid) + logits_grid = torch.cat(logits_grid_list, dim=1) + grid_logits[index.indices] = logits_grid.squeeze(0).squeeze(-1) + next_logits[nidx] = grid_logits + grid_logits = next_logits.unsqueeze(0) + + grid_logits[grid_logits == -10000.] = float('nan') + + return grid_logits diff --git a/hy3dgen/shapegen/models/conditioner.py b/hy3dgen/shapegen/models/conditioner.py new file mode 100644 index 0000000..d0d848c --- /dev/null +++ b/hy3dgen/shapegen/models/conditioner.py @@ -0,0 +1,257 @@ +# Open Source Model Licensed under the Apache License Version 2.0 +# and Other Licenses of the Third-Party Components therein: +# The below Model in this distribution may have been modified by THL A29 Limited +# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited. + +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# The below software and/or models in this distribution may have been +# modified by THL A29 Limited ("Tencent Modifications"). +# All Tencent Modifications are Copyright (C) THL A29 Limited. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import numpy as np +import torch +import torch.nn as nn +from torchvision import transforms +from transformers import ( + CLIPVisionModelWithProjection, + CLIPVisionConfig, + Dinov2Model, + Dinov2Config, +) + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float64) + omega /= embed_dim / 2. + omega = 1. 
/ 10000 ** omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + return np.concatenate([emb_sin, emb_cos], axis=1) + + +class ImageEncoder(nn.Module): + def __init__( + self, + version=None, + config=None, + use_cls_token=True, + image_size=224, + **kwargs, + ): + super().__init__() + + if config is None: + self.model = self.MODEL_CLASS.from_pretrained(version) + else: + self.model = self.MODEL_CLASS(self.MODEL_CONFIG_CLASS.from_dict(config)) + self.model.eval() + self.model.requires_grad_(False) + self.use_cls_token = use_cls_token + self.size = image_size // 14 + self.num_patches = (image_size // 14) ** 2 + if self.use_cls_token: + self.num_patches += 1 + + self.transform = transforms.Compose( + [ + transforms.Resize(image_size, transforms.InterpolationMode.BILINEAR, antialias=True), + transforms.CenterCrop(image_size), + transforms.Normalize( + mean=self.mean, + std=self.std, + ), + ] + ) + + def forward(self, image, mask=None, value_range=(-1, 1), **kwargs): + if value_range is not None: + low, high = value_range + image = (image - low) / (high - low) + + image = image.to(self.model.device, dtype=self.model.dtype) + inputs = self.transform(image) + outputs = self.model(inputs) + + last_hidden_state = outputs.last_hidden_state + if not self.use_cls_token: + last_hidden_state = last_hidden_state[:, 1:, :] + + return last_hidden_state + + def unconditional_embedding(self, batch_size, **kwargs): + device = next(self.model.parameters()).device + dtype = next(self.model.parameters()).dtype + zero = torch.zeros( + batch_size, + self.num_patches, + self.model.config.hidden_size, + device=device, + dtype=dtype, + ) + + return zero + + +class CLIPImageEncoder(ImageEncoder): + MODEL_CLASS = CLIPVisionModelWithProjection + MODEL_CONFIG_CLASS = CLIPVisionConfig + mean = [0.48145466, 0.4578275, 0.40821073] + std = [0.26862954, 0.26130258, 0.27577711] + + +class DinoImageEncoder(ImageEncoder): + MODEL_CLASS = Dinov2Model + MODEL_CONFIG_CLASS = Dinov2Config + mean = [0.485, 0.456, 0.406] + std = [0.229, 0.224, 0.225] + + +class DinoImageEncoderMV(DinoImageEncoder): + def __init__( + self, + version=None, + config=None, + use_cls_token=True, + image_size=224, + view_num=4, + **kwargs, + ): + super().__init__(version, config, use_cls_token, image_size, **kwargs) + self.view_num = view_num + self.num_patches = self.num_patches + pos = np.arange(self.view_num, dtype=np.float32) + view_embedding = torch.from_numpy( + get_1d_sincos_pos_embed_from_grid(self.model.config.hidden_size, pos)).float() + + view_embedding = view_embedding.unsqueeze(1).repeat(1, self.num_patches, 1) + self.view_embed = view_embedding.unsqueeze(0) + + def forward(self, image, mask=None, value_range=(-1, 1), view_idxs=None): + if value_range is not None: + low, high = value_range + image = (image - low) / (high - low) + + image = image.to(self.model.device, dtype=self.model.dtype) + + bs, num_views, c, h, w = image.shape + image = image.view(bs * num_views, c, h, w) + + inputs = self.transform(image) + outputs = self.model(inputs) + + last_hidden_state = outputs.last_hidden_state + last_hidden_state = last_hidden_state.view( + bs, num_views, last_hidden_state.shape[-2], + last_hidden_state.shape[-1] + ) + + view_embedding = self.view_embed.to(last_hidden_state.dtype).to(last_hidden_state.device) + if view_idxs is not None: + assert len(view_idxs) == bs + view_embeddings = [] + for i in range(bs): + 
view_idx = view_idxs[i] + assert num_views == len(view_idx) + view_embeddings.append(self.view_embed[:, view_idx, ...]) + view_embedding = torch.cat(view_embeddings, 0).to(last_hidden_state.dtype).to(last_hidden_state.device) + + if num_views != self.view_num: + view_embedding = view_embedding[:, :num_views, ...] + last_hidden_state = last_hidden_state + view_embedding + last_hidden_state = last_hidden_state.view(bs, num_views * last_hidden_state.shape[-2], + last_hidden_state.shape[-1]) + return last_hidden_state + + def unconditional_embedding(self, batch_size, view_idxs=None, **kwargs): + device = next(self.model.parameters()).device + dtype = next(self.model.parameters()).dtype + zero = torch.zeros( + batch_size, + self.num_patches * len(view_idxs[0]), + self.model.config.hidden_size, + device=device, + dtype=dtype, + ) + return zero + + +def build_image_encoder(config): + if config['type'] == 'CLIPImageEncoder': + return CLIPImageEncoder(**config['kwargs']) + elif config['type'] == 'DinoImageEncoder': + return DinoImageEncoder(**config['kwargs']) + elif config['type'] == 'DinoImageEncoderMV': + return DinoImageEncoderMV(**config['kwargs']) + else: + raise ValueError(f'Unknown image encoder type: {config["type"]}') + + +class DualImageEncoder(nn.Module): + def __init__( + self, + main_image_encoder, + additional_image_encoder, + ): + super().__init__() + self.main_image_encoder = build_image_encoder(main_image_encoder) + self.additional_image_encoder = build_image_encoder(additional_image_encoder) + + def forward(self, image, mask=None, **kwargs): + outputs = { + 'main': self.main_image_encoder(image, mask=mask, **kwargs), + 'additional': self.additional_image_encoder(image, mask=mask, **kwargs), + } + return outputs + + def unconditional_embedding(self, batch_size, **kwargs): + outputs = { + 'main': self.main_image_encoder.unconditional_embedding(batch_size, **kwargs), + 'additional': self.additional_image_encoder.unconditional_embedding(batch_size, **kwargs), + } + return outputs + + +class SingleImageEncoder(nn.Module): + def __init__( + self, + main_image_encoder, + ): + super().__init__() + self.main_image_encoder = build_image_encoder(main_image_encoder) + + def forward(self, image, mask=None, **kwargs): + outputs = { + 'main': self.main_image_encoder(image, mask=mask, **kwargs), + } + return outputs + + def unconditional_embedding(self, batch_size, **kwargs): + outputs = { + 'main': self.main_image_encoder.unconditional_embedding(batch_size, **kwargs), + } + return outputs diff --git a/hy3dgen/shapegen/models/denoisers/__init__.py b/hy3dgen/shapegen/models/denoisers/__init__.py new file mode 100644 index 0000000..7260933 --- /dev/null +++ b/hy3dgen/shapegen/models/denoisers/__init__.py @@ -0,0 +1,15 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. 
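build_image_encoder turns a small config dict into one of these encoders, and the SingleImageEncoder/DualImageEncoder wrappers return conditioning tokens keyed by 'main' (and 'additional'). A sketch of wiring one up by hand; the DINOv2 checkpoint name and image size are assumptions for illustration, not values read from a shipped config:

import torch
from hy3dgen.shapegen.models.conditioner import SingleImageEncoder

conditioner = SingleImageEncoder(
    main_image_encoder={
        'type': 'DinoImageEncoder',
        'kwargs': {'version': 'facebook/dinov2-giant', 'image_size': 518},
    },
)
image = torch.rand(2, 3, 518, 518) * 2 - 1     # value_range defaults to (-1, 1)
cond = conditioner(image)                      # {'main': (2, num_patches, hidden_size)}
uncond = conditioner.unconditional_embedding(batch_size=2)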
+ +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +from .hunyuan3ddit import Hunyuan3DDiT diff --git a/hy3dgen/shapegen/models/denoisers/hunyuan3ddit.py b/hy3dgen/shapegen/models/denoisers/hunyuan3ddit.py new file mode 100644 index 0000000..7873f16 --- /dev/null +++ b/hy3dgen/shapegen/models/denoisers/hunyuan3ddit.py @@ -0,0 +1,410 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import math +import os +from dataclasses import dataclass +from typing import List, Tuple, Optional + +import torch +from einops import rearrange +from torch import Tensor, nn + +scaled_dot_product_attention = nn.functional.scaled_dot_product_attention +if os.environ.get('USE_SAGEATTN', '0') == '1': + try: + from sageattention import sageattn + except ImportError: + raise ImportError('Please install the package "sageattention" to use this USE_SAGEATTN.') + scaled_dot_product_attention = sageattn + + +def attention(q: Tensor, k: Tensor, v: Tensor, **kwargs) -> Tensor: + x = scaled_dot_product_attention(q, k, v) + x = rearrange(x, "B H L D -> B L (H D)") + return x + + +def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0): + """ + Create sinusoidal timestep embeddings. + :param t: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param dim: the dimension of the output. + :param max_period: controls the minimum frequency of the embeddings. + :return: an (N, D) Tensor of positional embeddings. 
+ """ + t = time_factor * t + half = dim // 2 + freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to( + t.device + ) + + args = t[:, None].float() * freqs[None] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) + if torch.is_floating_point(t): + embedding = embedding.to(t) + return embedding + + +class GELU(nn.Module): + def __init__(self, approximate='tanh'): + super().__init__() + self.approximate = approximate + + def forward(self, x: Tensor) -> Tensor: + return nn.functional.gelu(x.contiguous(), approximate=self.approximate) + + +class MLPEmbedder(nn.Module): + def __init__(self, in_dim: int, hidden_dim: int): + super().__init__() + self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True) + self.silu = nn.SiLU() + self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True) + + def forward(self, x: Tensor) -> Tensor: + return self.out_layer(self.silu(self.in_layer(x))) + + +class RMSNorm(torch.nn.Module): + def __init__(self, dim: int): + super().__init__() + self.scale = nn.Parameter(torch.ones(dim)) + + def forward(self, x: Tensor): + x_dtype = x.dtype + x = x.float() + rrms = torch.rsqrt(torch.mean(x ** 2, dim=-1, keepdim=True) + 1e-6) + return (x * rrms).to(dtype=x_dtype) * self.scale + + +class QKNorm(torch.nn.Module): + def __init__(self, dim: int): + super().__init__() + self.query_norm = RMSNorm(dim) + self.key_norm = RMSNorm(dim) + + def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tuple[Tensor, Tensor]: + q = self.query_norm(q) + k = self.key_norm(k) + return q.to(v), k.to(v) + + +class SelfAttention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + ): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.norm = QKNorm(head_dim) + self.proj = nn.Linear(dim, dim) + + def forward(self, x: Tensor, pe: Tensor) -> Tensor: + qkv = self.qkv(x) + q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + q, k = self.norm(q, k, v) + x = attention(q, k, v, pe=pe) + x = self.proj(x) + return x + + +@dataclass +class ModulationOut: + shift: Tensor + scale: Tensor + gate: Tensor + + +class Modulation(nn.Module): + def __init__(self, dim: int, double: bool): + super().__init__() + self.is_double = double + self.multiplier = 6 if double else 3 + self.lin = nn.Linear(dim, self.multiplier * dim, bias=True) + + def forward(self, vec: Tensor) -> Tuple[ModulationOut, Optional[ModulationOut]]: + out = self.lin(nn.functional.silu(vec))[:, None, :] + out = out.chunk(self.multiplier, dim=-1) + + return ( + ModulationOut(*out[:3]), + ModulationOut(*out[3:]) if self.is_double else None, + ) + + +class DoubleStreamBlock(nn.Module): + def __init__( + self, + hidden_size: int, + num_heads: int, + mlp_ratio: float, + qkv_bias: bool = False, + ): + super().__init__() + mlp_hidden_dim = int(hidden_size * mlp_ratio) + self.num_heads = num_heads + self.hidden_size = hidden_size + self.img_mod = Modulation(hidden_size, double=True) + self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias) + + self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.img_mlp = nn.Sequential( + nn.Linear(hidden_size, mlp_hidden_dim, bias=True), + GELU(approximate="tanh"), 
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True), + ) + + self.txt_mod = Modulation(hidden_size, double=True) + self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias) + + self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.txt_mlp = nn.Sequential( + nn.Linear(hidden_size, mlp_hidden_dim, bias=True), + GELU(approximate="tanh"), + nn.Linear(mlp_hidden_dim, hidden_size, bias=True), + ) + + def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> Tuple[Tensor, Tensor]: + img_mod1, img_mod2 = self.img_mod(vec) + txt_mod1, txt_mod2 = self.txt_mod(vec) + + img_modulated = self.img_norm1(img) + img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift + img_qkv = self.img_attn.qkv(img_modulated) + img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + img_q, img_k = self.img_attn.norm(img_q, img_k, img_v) + + txt_modulated = self.txt_norm1(txt) + txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift + txt_qkv = self.txt_attn.qkv(txt_modulated) + txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v) + + q = torch.cat((txt_q, img_q), dim=2) + k = torch.cat((txt_k, img_k), dim=2) + v = torch.cat((txt_v, img_v), dim=2) + + attn = attention(q, k, v, pe=pe) + txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:] + + img = img + img_mod1.gate * self.img_attn.proj(img_attn) + img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift) + + txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn) + txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift) + return img, txt + + +class SingleStreamBlock(nn.Module): + """ + A DiT block with parallel linear layers as described in + https://arxiv.org/abs/2302.05442 and adapted modulation interface. 
+ """ + + def __init__( + self, + hidden_size: int, + num_heads: int, + mlp_ratio: float = 4.0, + qk_scale: Optional[float] = None, + ): + super().__init__() + + self.hidden_dim = hidden_size + self.num_heads = num_heads + head_dim = hidden_size // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.mlp_hidden_dim = int(hidden_size * mlp_ratio) + # qkv and mlp_in + self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim) + # proj and mlp_out + self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size) + + self.norm = QKNorm(head_dim) + + self.hidden_size = hidden_size + self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + + self.mlp_act = GELU(approximate="tanh") + self.modulation = Modulation(hidden_size, double=False) + + def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor: + mod, _ = self.modulation(vec) + + x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift + qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1) + + q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + q, k = self.norm(q, k, v) + + # compute attention + attn = attention(q, k, v, pe=pe) + # compute activation in mlp stream, cat again and run second linear layer + output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)) + return x + mod.gate * output + + +class LastLayer(nn.Module): + def __init__(self, hidden_size: int, patch_size: int, out_channels: int): + super().__init__() + self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True) + self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True)) + + def forward(self, x: Tensor, vec: Tensor) -> Tensor: + shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1) + x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :] + x = self.linear(x) + return x + + +class Hunyuan3DDiT(nn.Module): + def __init__( + self, + in_channels: int = 64, + context_in_dim: int = 1536, + hidden_size: int = 1024, + mlp_ratio: float = 4.0, + num_heads: int = 16, + depth: int = 16, + depth_single_blocks: int = 32, + axes_dim: List[int] = [64], + theta: int = 10_000, + qkv_bias: bool = True, + time_factor: float = 1000, + guidance_embed: bool = False, + ckpt_path: Optional[str] = None, + **kwargs, + ): + super().__init__() + self.in_channels = in_channels + self.context_in_dim = context_in_dim + self.hidden_size = hidden_size + self.mlp_ratio = mlp_ratio + self.num_heads = num_heads + self.depth = depth + self.depth_single_blocks = depth_single_blocks + self.axes_dim = axes_dim + self.theta = theta + self.qkv_bias = qkv_bias + self.time_factor = time_factor + self.out_channels = self.in_channels + self.guidance_embed = guidance_embed + + if hidden_size % num_heads != 0: + raise ValueError( + f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}" + ) + pe_dim = hidden_size // num_heads + if sum(axes_dim) != pe_dim: + raise ValueError(f"Got {axes_dim} but expected positional dim {pe_dim}") + self.hidden_size = hidden_size + self.num_heads = num_heads + self.latent_in = nn.Linear(self.in_channels, self.hidden_size, bias=True) + self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) + self.cond_in = nn.Linear(context_in_dim, self.hidden_size) + self.guidance_in = ( + MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if guidance_embed else 
nn.Identity() + ) + + self.double_blocks = nn.ModuleList( + [ + DoubleStreamBlock( + self.hidden_size, + self.num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + ) + for _ in range(depth) + ] + ) + + self.single_blocks = nn.ModuleList( + [ + SingleStreamBlock( + self.hidden_size, + self.num_heads, + mlp_ratio=mlp_ratio, + ) + for _ in range(depth_single_blocks) + ] + ) + + self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels) + + if ckpt_path is not None: + print('restored denoiser ckpt', ckpt_path) + + ckpt = torch.load(ckpt_path, map_location="cpu") + if 'state_dict' not in ckpt: + # deepspeed ckpt + state_dict = {} + for k in ckpt.keys(): + new_k = k.replace('_forward_module.', '') + state_dict[new_k] = ckpt[k] + else: + state_dict = ckpt["state_dict"] + + final_state_dict = {} + for k, v in state_dict.items(): + if k.startswith('model.'): + final_state_dict[k.replace('model.', '')] = v + else: + final_state_dict[k] = v + missing, unexpected = self.load_state_dict(final_state_dict, strict=False) + print('unexpected keys:', unexpected) + print('missing keys:', missing) + + def forward( + self, + x, + t, + contexts, + **kwargs, + ) -> Tensor: + cond = contexts['main'] + latent = self.latent_in(x) + + vec = self.time_in(timestep_embedding(t, 256, self.time_factor).to(dtype=latent.dtype)) + if self.guidance_embed: + guidance = kwargs.get('guidance', None) + if guidance is None: + raise ValueError("Didn't get guidance strength for guidance distilled model.") + vec = vec + self.guidance_in(timestep_embedding(guidance, 256, self.time_factor)) + + cond = self.cond_in(cond) + pe = None + + for block in self.double_blocks: + latent, cond = block(img=latent, txt=cond, vec=vec, pe=pe) + + latent = torch.cat((cond, latent), 1) + for block in self.single_blocks: + latent = block(latent, vec=vec, pe=pe) + + latent = latent[:, cond.shape[1]:, ...] + latent = self.final_layer(latent, vec) + return latent diff --git a/hy3dgen/shapegen/pipelines.py b/hy3dgen/shapegen/pipelines.py new file mode 100644 index 0000000..09108a7 --- /dev/null +++ b/hy3dgen/shapegen/pipelines.py @@ -0,0 +1,765 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
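In forward, the conditioning tokens and the noisy latent tokens first run through the double-stream blocks as separate streams, are then concatenated for the single-stream blocks, and the latent part is finally projected back to in_channels. A shape-only sketch using the constructor defaults above; the batch size and token counts are arbitrary:

import torch
from hy3dgen.shapegen.models.denoisers import Hunyuan3DDiT

model = Hunyuan3DDiT()                           # in_channels=64, context_in_dim=1536, ...
x = torch.randn(2, 1024, 64)                     # (batch, latent tokens, in_channels)
t = torch.rand(2)                                # flow-matching timesteps in [0, 1]
contexts = {'main': torch.randn(2, 1370, 1536)}  # image-conditioning tokens

with torch.no_grad():
    v = model(x, t, contexts)                    # (2, 1024, 64), same shape as x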
+ +import copy +import importlib +import inspect +import os +from typing import List, Optional, Union + +import numpy as np +import torch +import trimesh +import yaml +from PIL import Image +from diffusers.utils.torch_utils import randn_tensor +from diffusers.utils.import_utils import is_accelerate_version, is_accelerate_available +from tqdm import tqdm + +from .models.autoencoders import ShapeVAE +from .models.autoencoders import SurfaceExtractors +from .utils import logger, synchronize_timer, smart_load_model + + +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +@synchronize_timer('Export to trimesh') +def export_to_trimesh(mesh_output): + if isinstance(mesh_output, list): + outputs = [] + for mesh in mesh_output: + if mesh is None: + outputs.append(None) + else: + mesh.mesh_f = mesh.mesh_f[:, ::-1] + mesh_output = trimesh.Trimesh(mesh.mesh_v, mesh.mesh_f) + outputs.append(mesh_output) + return outputs + else: + mesh_output.mesh_f = mesh_output.mesh_f[:, ::-1] + mesh_output = trimesh.Trimesh(mesh_output.mesh_v, mesh_output.mesh_f) + return mesh_output + + +def get_obj_from_str(string, reload=False): + module, cls = string.rsplit(".", 1) + if reload: + module_imp = importlib.import_module(module) + importlib.reload(module_imp) + return getattr(importlib.import_module(module, package=None), cls) + + +def instantiate_from_config(config, **kwargs): + if "target" not in config: + raise KeyError("Expected key `target` to instantiate.") + cls = get_obj_from_str(config["target"]) + params = config.get("params", dict()) + kwargs.update(params) + instance = cls(**kwargs) + return instance + + +class Hunyuan3DDiTPipeline: + model_cpu_offload_seq = "conditioner->model->vae" + _exclude_from_cpu_offload = [] + + @classmethod + @synchronize_timer('Hunyuan3DDiTPipeline Model Loading') + def from_single_file( + cls, + ckpt_path, + config_path, + device='cuda', + dtype=torch.float16, + use_safetensors=None, + **kwargs, + ): + # load config + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + + # load ckpt + if use_safetensors: + ckpt_path = ckpt_path.replace('.ckpt', '.safetensors') + if not os.path.exists(ckpt_path): + raise FileNotFoundError(f"Model file {ckpt_path} not found") + logger.info(f"Loading model from {ckpt_path}") + + if use_safetensors: + # parse safetensors + import safetensors.torch + safetensors_ckpt = safetensors.torch.load_file(ckpt_path, device='cpu') + ckpt = {} + for key, value in safetensors_ckpt.items(): + model_name = key.split('.')[0] + new_key = key[len(model_name) + 1:] + if model_name not in ckpt: + ckpt[model_name] = {} + ckpt[model_name][new_key] = value + else: + ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=True) + # load model + model = instantiate_from_config(config['model']) + model.load_state_dict(ckpt['model']) + vae = instantiate_from_config(config['vae']) + vae.load_state_dict(ckpt['vae']) + conditioner = instantiate_from_config(config['conditioner']) + if 'conditioner' in ckpt: + conditioner.load_state_dict(ckpt['conditioner']) + image_processor = instantiate_from_config(config['image_processor']) + scheduler = instantiate_from_config(config['scheduler']) + + model_kwargs = dict( + vae=vae, + model=model, + scheduler=scheduler, + conditioner=conditioner, + image_processor=image_processor, + device=device, + dtype=dtype, + ) + model_kwargs.update(kwargs) + + return cls( + **model_kwargs + ) + + @classmethod + def from_pretrained( + cls, + model_path, + device='cuda', + dtype=torch.float16, + use_safetensors=True, + variant='fp16', + subfolder='hunyuan3d-dit-v2-0', + **kwargs, + ): + kwargs['from_pretrained_kwargs'] = dict( + model_path=model_path, + subfolder=subfolder, + use_safetensors=use_safetensors, + variant=variant, + dtype=dtype, + device=device, + ) + config_path, ckpt_path = 
smart_load_model( + model_path, + subfolder=subfolder, + use_safetensors=use_safetensors, + variant=variant + ) + return cls.from_single_file( + ckpt_path, + config_path, + device=device, + dtype=dtype, + use_safetensors=use_safetensors, + **kwargs + ) + + def __init__( + self, + vae, + model, + scheduler, + conditioner, + image_processor, + device='cuda', + dtype=torch.float16, + **kwargs + ): + self.vae = vae + self.model = model + self.scheduler = scheduler + self.conditioner = conditioner + self.image_processor = image_processor + self.kwargs = kwargs + self.to(device, dtype) + + def compile(self): + self.vae = torch.compile(self.vae) + self.model = torch.compile(self.model) + self.conditioner = torch.compile(self.conditioner) + + def enable_flashvdm( + self, + enabled: bool = True, + adaptive_kv_selection=True, + topk_mode='mean', + mc_algo='mc', + replace_vae=True, + ): + if enabled: + model_path = self.kwargs['from_pretrained_kwargs']['model_path'] + turbo_vae_mapping = { + 'Hunyuan3D-2': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0-turbo'), + 'Hunyuan3D-2mv': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0-turbo'), + 'Hunyuan3D-2mini': ('tencent/Hunyuan3D-2mini', 'hunyuan3d-vae-v2-mini-turbo'), + } + model_name = model_path.split('/')[-1] + if replace_vae and model_name in turbo_vae_mapping: + model_path, subfolder = turbo_vae_mapping[model_name] + self.vae = ShapeVAE.from_pretrained( + model_path, subfolder=subfolder, + use_safetensors=self.kwargs['from_pretrained_kwargs']['use_safetensors'], + device=self.device, + ) + self.vae.enable_flashvdm_decoder( + enabled=enabled, + adaptive_kv_selection=adaptive_kv_selection, + topk_mode=topk_mode, + mc_algo=mc_algo + ) + else: + model_path = self.kwargs['from_pretrained_kwargs']['model_path'] + vae_mapping = { + 'Hunyuan3D-2': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0'), + 'Hunyuan3D-2mv': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0'), + 'Hunyuan3D-2mini': ('tencent/Hunyuan3D-2mini', 'hunyuan3d-vae-v2-mini'), + } + model_name = model_path.split('/')[-1] + if model_name in vae_mapping: + model_path, subfolder = vae_mapping[model_name] + self.vae = ShapeVAE.from_pretrained(model_path, subfolder=subfolder) + self.vae.enable_flashvdm_decoder(enabled=False) + + def to(self, device=None, dtype=None): + if dtype is not None: + self.dtype = dtype + self.vae.to(dtype=dtype) + self.model.to(dtype=dtype) + self.conditioner.to(dtype=dtype) + if device is not None: + self.device = torch.device(device) + self.vae.to(device) + self.model.to(device) + self.conditioner.to(device) + + @property + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + [`~DiffusionPipeline.enable_sequential_cpu_offload`] the execution device can only be inferred from + Accelerate's module hooks. + """ + for name, model in self.components.items(): + if not isinstance(model, torch.nn.Module) or name in self._exclude_from_cpu_offload: + continue + + if not hasattr(model, "_hf_hook"): + return self.device + for module in model.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. 
Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + + Arguments: + gpu_id (`int`, *optional*): + The ID of the accelerator that shall be used in inference. If not specified, it will default to 0. + device (`torch.Device` or `str`, *optional*, defaults to "cuda"): + The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will + default to "cuda". + """ + if self.model_cpu_offload_seq is None: + raise ValueError( + "Model CPU offload cannot be enabled because no `model_cpu_offload_seq` class attribute is set." + ) + + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + torch_device = torch.device(device) + device_index = torch_device.index + + if gpu_id is not None and device_index is not None: + raise ValueError( + f"You have passed both `gpu_id`={gpu_id} and an index as part of the passed device `device`={device}" + f"Cannot pass both. Please make sure to either not define `gpu_id` or not pass the index as part of the device: `device`={torch_device.type}" + ) + + # _offload_gpu_id should be set to passed gpu_id (or id in passed `device`) or default to previously set id or default to 0 + self._offload_gpu_id = gpu_id or torch_device.index or getattr(self, "_offload_gpu_id", 0) + + device_type = torch_device.type + device = torch.device(f"{device_type}:{self._offload_gpu_id}") + + if self.device.type != "cpu": + self.to("cpu") + device_mod = getattr(torch, self.device.type, None) + if hasattr(device_mod, "empty_cache") and device_mod.is_available(): + device_mod.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + all_model_components = {k: v for k, v in self.components.items() if isinstance(v, torch.nn.Module)} + + self._all_hooks = [] + hook = None + for model_str in self.model_cpu_offload_seq.split("->"): + model = all_model_components.pop(model_str, None) + if not isinstance(model, torch.nn.Module): + continue + + _, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook) + self._all_hooks.append(hook) + + # CPU offload models that are not in the seq chain unless they are explicitly excluded + # these models will stay on CPU until maybe_free_model_hooks is called + # some models cannot be in the seq chain because they are iteratively called, such as controlnet + for name, model in all_model_components.items(): + if not isinstance(model, torch.nn.Module): + continue + + if name in self._exclude_from_cpu_offload: + model.to(device) + else: + _, hook = cpu_offload_with_hook(model, device) + self._all_hooks.append(hook) + + def maybe_free_model_hooks(self): + r""" + Function that offloads all components, removes all model hooks that were added when using + `enable_model_cpu_offload` and then applies them again. In case the model has not been offloaded this function + is a no-op. Make sure to add this function to the end of the `__call__` function of your pipeline so that it + functions correctly when applying enable_model_cpu_offload. 
+ """ + if not hasattr(self, "_all_hooks") or len(self._all_hooks) == 0: + # `enable_model_cpu_offload` has not be called, so silently do nothing + return + + for hook in self._all_hooks: + # offload model and remove hook from model + hook.offload() + hook.remove() + + # make sure the model is in the same state as before calling it + self.enable_model_cpu_offload() + + @synchronize_timer('Encode cond') + def encode_cond(self, image, additional_cond_inputs, do_classifier_free_guidance, dual_guidance): + bsz = image.shape[0] + cond = self.conditioner(image=image, **additional_cond_inputs) + + if do_classifier_free_guidance: + un_cond = self.conditioner.unconditional_embedding(bsz, **additional_cond_inputs) + + if dual_guidance: + un_cond_drop_main = copy.deepcopy(un_cond) + un_cond_drop_main['additional'] = cond['additional'] + + def cat_recursive(a, b, c): + if isinstance(a, torch.Tensor): + return torch.cat([a, b, c], dim=0).to(self.dtype) + out = {} + for k in a.keys(): + out[k] = cat_recursive(a[k], b[k], c[k]) + return out + + cond = cat_recursive(cond, un_cond_drop_main, un_cond) + else: + def cat_recursive(a, b): + if isinstance(a, torch.Tensor): + return torch.cat([a, b], dim=0).to(self.dtype) + out = {} + for k in a.keys(): + out[k] = cat_recursive(a[k], b[k]) + return out + + cond = cat_recursive(cond, un_cond) + return cond + + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def prepare_latents(self, batch_size, dtype, device, generator, latents=None): + shape = (batch_size, *self.vae.latent_shape) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * getattr(self.scheduler, 'init_noise_sigma', 1.0) + return latents + + def prepare_image(self, image) -> dict: + if isinstance(image, str) and not os.path.exists(image): + raise FileNotFoundError(f"Couldn't find image at path {image}") + + if not isinstance(image, list): + image = [image] + + outputs = [] + for img in image: + output = self.image_processor(img) + outputs.append(output) + + cond_input = {k: [] for k in outputs[0].keys()} + for output in outputs: + for key, value in output.items(): + cond_input[key].append(value) + for key, value in cond_input.items(): + if isinstance(value[0], torch.Tensor): + cond_input[key] = torch.cat(value, dim=0) + + return cond_input + + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + def set_surface_extractor(self, mc_algo): + if mc_algo is None: + return + logger.info('The parameters `mc_algo` is deprecated, and will be removed in future versions.\n' + 'Please use: \n' + 'from hy3dgen.shapegen.models.autoencoders import SurfaceExtractors\n' + 'pipeline.vae.surface_extractor = SurfaceExtractors[mc_algo]() instead\n') + if mc_algo not in SurfaceExtractors.keys(): + raise ValueError(f"Unknown mc_algo {mc_algo}") + self.vae.surface_extractor = SurfaceExtractors[mc_algo]() + + @torch.no_grad() + def __call__( + self, + image: Union[str, List[str], Image.Image] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + sigmas: List[float] = None, + eta: float = 0.0, + guidance_scale: float = 7.5, + dual_guidance_scale: float = 10.5, + dual_guidance: bool = True, + generator=None, + box_v=1.01, + octree_resolution=384, + mc_level=-1 / 512, + num_chunks=8000, + mc_algo=None, + output_type: Optional[str] = "trimesh", + enable_pbar=True, + **kwargs, + ) -> List[List[trimesh.Trimesh]]: + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + self.set_surface_extractor(mc_algo) + + device = self.device + dtype = self.dtype + do_classifier_free_guidance = guidance_scale >= 0 and \ + getattr(self.model, 'guidance_cond_proj_dim', None) is None + dual_guidance = dual_guidance_scale >= 0 and dual_guidance + + cond_inputs = self.prepare_image(image) + image = cond_inputs.pop('image') + cond = self.encode_cond( + image=image, + additional_cond_inputs=cond_inputs, + do_classifier_free_guidance=do_classifier_free_guidance, + 
dual_guidance=False, + ) + batch_size = image.shape[0] + + t_dtype = torch.long + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas) + + latents = self.prepare_latents(batch_size, dtype, device, generator) + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + guidance_cond = None + if getattr(self.model, 'guidance_cond_proj_dim', None) is not None: + logger.info('Using lcm guidance scale') + guidance_scale_tensor = torch.tensor(guidance_scale - 1).repeat(batch_size) + guidance_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.model.guidance_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + with synchronize_timer('Diffusion Sampling'): + for i, t in enumerate(tqdm(timesteps, disable=not enable_pbar, desc="Diffusion Sampling:", leave=False)): + # expand the latents if we are doing classifier free guidance + if do_classifier_free_guidance: + latent_model_input = torch.cat([latents] * (3 if dual_guidance else 2)) + else: + latent_model_input = latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + timestep_tensor = torch.tensor([t], dtype=t_dtype, device=device) + timestep_tensor = timestep_tensor.expand(latent_model_input.shape[0]) + noise_pred = self.model(latent_model_input, timestep_tensor, cond, guidance_cond=guidance_cond) + + # no drop, drop clip, all drop + if do_classifier_free_guidance: + if dual_guidance: + noise_pred_clip, noise_pred_dino, noise_pred_uncond = noise_pred.chunk(3) + noise_pred = ( + noise_pred_uncond + + guidance_scale * (noise_pred_clip - noise_pred_dino) + + dual_guidance_scale * (noise_pred_dino - noise_pred_uncond) + ) + else: + noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + outputs = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) + latents = outputs.prev_sample + + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, outputs) + + return self._export( + latents, + output_type, + box_v, mc_level, num_chunks, octree_resolution, mc_algo, + ) + + def _export( + self, + latents, + output_type='trimesh', + box_v=1.01, + mc_level=0.0, + num_chunks=20000, + octree_resolution=256, + mc_algo='mc', + enable_pbar=True + ): + if not output_type == "latent": + latents = 1. 
/ self.vae.scale_factor * latents + latents = self.vae(latents) + outputs = self.vae.latents2mesh( + latents, + bounds=box_v, + mc_level=mc_level, + num_chunks=num_chunks, + octree_resolution=octree_resolution, + mc_algo=mc_algo, + enable_pbar=enable_pbar, + ) + else: + outputs = latents + + if output_type == 'trimesh': + outputs = export_to_trimesh(outputs) + + return outputs + + +class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline): + + @torch.inference_mode() + def __call__( + self, + image: Union[str, List[str], Image.Image, dict, List[dict]] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + sigmas: List[float] = None, + eta: float = 0.0, + guidance_scale: float = 5.0, + generator=None, + box_v=1.01, + octree_resolution=384, + mc_level=0.0, + mc_algo=None, + num_chunks=8000, + output_type: Optional[str] = "trimesh", + enable_pbar=True, + **kwargs, + ) -> List[List[trimesh.Trimesh]]: + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + self.set_surface_extractor(mc_algo) + + device = self.device + dtype = self.dtype + do_classifier_free_guidance = guidance_scale >= 0 and not ( + hasattr(self.model, 'guidance_embed') and + self.model.guidance_embed is True + ) + + cond_inputs = self.prepare_image(image) + image = cond_inputs.pop('image') + cond = self.encode_cond( + image=image, + additional_cond_inputs=cond_inputs, + do_classifier_free_guidance=do_classifier_free_guidance, + dual_guidance=False, + ) + batch_size = image.shape[0] + + # 5. Prepare timesteps + # NOTE: this is slightly different from common usage, we start from 0. + sigmas = np.linspace(0, 1, num_inference_steps) if sigmas is None else sigmas + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, + num_inference_steps, + device, + sigmas=sigmas, + ) + latents = self.prepare_latents(batch_size, dtype, device, generator) + + guidance = None + if hasattr(self.model, 'guidance_embed') and \ + self.model.guidance_embed is True: + guidance = torch.tensor([guidance_scale] * batch_size, device=device, dtype=dtype) + # logger.info(f'Using guidance embed with scale {guidance_scale}') + + with synchronize_timer('Diffusion Sampling'): + for i, t in enumerate(tqdm(timesteps, disable=not enable_pbar, desc="Diffusion Sampling:")): + # expand the latents if we are doing classifier free guidance + if do_classifier_free_guidance: + latent_model_input = torch.cat([latents] * 2) + else: + latent_model_input = latents + + # NOTE: we assume model get timesteps ranged from 0 to 1 + timestep = t.expand(latent_model_input.shape[0]).to( + latents.dtype) / self.scheduler.config.num_train_timesteps + noise_pred = self.model(latent_model_input, timestep, cond, guidance=guidance) + + if do_classifier_free_guidance: + noise_pred_cond, noise_pred_uncond = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + outputs = self.scheduler.step(noise_pred, t, latents) + latents = outputs.prev_sample + + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, outputs) + + return self._export( + latents, + output_type, + box_v, mc_level, num_chunks, octree_resolution, mc_algo, + enable_pbar=enable_pbar, + ) diff --git a/hy3dgen/shapegen/postprocessors.py b/hy3dgen/shapegen/postprocessors.py new file mode 100644 index 0000000..d258369 --- /dev/null +++ 
b/hy3dgen/shapegen/postprocessors.py @@ -0,0 +1,202 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import os +import tempfile +from typing import Union + +import numpy as np +import pymeshlab +import torch +import trimesh + +from .models.autoencoders import Latent2MeshOutput +from .utils import synchronize_timer + + +def load_mesh(path): + if path.endswith(".glb"): + mesh = trimesh.load(path) + else: + mesh = pymeshlab.MeshSet() + mesh.load_new_mesh(path) + return mesh + + +def reduce_face(mesh: pymeshlab.MeshSet, max_facenum: int = 200000): + if max_facenum > mesh.current_mesh().face_number(): + return mesh + + mesh.apply_filter( + "meshing_decimation_quadric_edge_collapse", + targetfacenum=max_facenum, + qualitythr=1.0, + preserveboundary=True, + boundaryweight=3, + preservenormal=True, + preservetopology=True, + autoclean=True + ) + return mesh + + +def remove_floater(mesh: pymeshlab.MeshSet): + mesh.apply_filter("compute_selection_by_small_disconnected_components_per_face", + nbfaceratio=0.005) + mesh.apply_filter("compute_selection_transfer_face_to_vertex", inclusive=False) + mesh.apply_filter("meshing_remove_selected_vertices_and_faces") + return mesh + + +def pymeshlab2trimesh(mesh: pymeshlab.MeshSet): + with tempfile.NamedTemporaryFile(suffix='.ply', delete=False) as temp_file: + mesh.save_current_mesh(temp_file.name) + mesh = trimesh.load(temp_file.name) + # 检查加载的对象类型 + if isinstance(mesh, trimesh.Scene): + combined_mesh = trimesh.Trimesh() + # 如果是Scene,遍历所有的geometry并合并 + for geom in mesh.geometry.values(): + combined_mesh = trimesh.util.concatenate([combined_mesh, geom]) + mesh = combined_mesh + return mesh + + +def trimesh2pymeshlab(mesh: trimesh.Trimesh): + with tempfile.NamedTemporaryFile(suffix='.ply', delete=False) as temp_file: + if isinstance(mesh, trimesh.scene.Scene): + for idx, obj in enumerate(mesh.geometry.values()): + if idx == 0: + temp_mesh = obj + else: + temp_mesh = temp_mesh + obj + mesh = temp_mesh + mesh.export(temp_file.name) + mesh = pymeshlab.MeshSet() + mesh.load_new_mesh(temp_file.name) + return mesh + + +def export_mesh(input, output): + if isinstance(input, pymeshlab.MeshSet): + mesh = output + elif isinstance(input, Latent2MeshOutput): + output = Latent2MeshOutput() + output.mesh_v = output.current_mesh().vertex_matrix() + output.mesh_f = output.current_mesh().face_matrix() + mesh = output + else: + mesh = pymeshlab2trimesh(output) + return mesh + + +def import_mesh(mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str]) -> pymeshlab.MeshSet: + if isinstance(mesh, str): + mesh = load_mesh(mesh) + elif isinstance(mesh, Latent2MeshOutput): + mesh = 
pymeshlab.MeshSet() + mesh_pymeshlab = pymeshlab.Mesh(vertex_matrix=mesh.mesh_v, face_matrix=mesh.mesh_f) + mesh.add_mesh(mesh_pymeshlab, "converted_mesh") + + if isinstance(mesh, (trimesh.Trimesh, trimesh.scene.Scene)): + mesh = trimesh2pymeshlab(mesh) + + return mesh + + +class FaceReducer: + @synchronize_timer('FaceReducer') + def __call__( + self, + mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str], + max_facenum: int = 40000 + ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh]: + ms = import_mesh(mesh) + ms = reduce_face(ms, max_facenum=max_facenum) + mesh = export_mesh(mesh, ms) + return mesh + + +class FloaterRemover: + @synchronize_timer('FloaterRemover') + def __call__( + self, + mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str], + ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput]: + ms = import_mesh(mesh) + ms = remove_floater(ms) + mesh = export_mesh(mesh, ms) + return mesh + + +class DegenerateFaceRemover: + @synchronize_timer('DegenerateFaceRemover') + def __call__( + self, + mesh: Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput, str], + ) -> Union[pymeshlab.MeshSet, trimesh.Trimesh, Latent2MeshOutput]: + ms = import_mesh(mesh) + + with tempfile.NamedTemporaryFile(suffix='.ply', delete=False) as temp_file: + ms.save_current_mesh(temp_file.name) + ms = pymeshlab.MeshSet() + ms.load_new_mesh(temp_file.name) + + mesh = export_mesh(mesh, ms) + return mesh + + +def mesh_normalize(mesh): + """ + Normalize mesh vertices to sphere + """ + scale_factor = 1.2 + vtx_pos = np.asarray(mesh.vertices) + max_bb = (vtx_pos - 0).max(0)[0] + min_bb = (vtx_pos - 0).min(0)[0] + + center = (max_bb + min_bb) / 2 + + scale = torch.norm(torch.tensor(vtx_pos - center, dtype=torch.float32), dim=1).max() * 2.0 + + vtx_pos = (vtx_pos - center) * (scale_factor / float(scale)) + mesh.vertices = vtx_pos + + return mesh + + +class MeshSimplifier: + def __init__(self, executable: str = None): + if executable is None: + CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) + executable = os.path.join(CURRENT_DIR, "mesh_simplifier.bin") + self.executable = executable + + @synchronize_timer('MeshSimplifier') + def __call__( + self, + mesh: Union[trimesh.Trimesh], + ) -> Union[trimesh.Trimesh]: + with tempfile.NamedTemporaryFile(suffix='.obj', delete=False) as temp_input: + with tempfile.NamedTemporaryFile(suffix='.obj', delete=False) as temp_output: + mesh.export(temp_input.name) + os.system(f'{self.executable} {temp_input.name} {temp_output.name}') + ms = trimesh.load(temp_output.name, process=False) + if isinstance(ms, trimesh.Scene): + combined_mesh = trimesh.Trimesh() + for geom in ms.geometry.values(): + combined_mesh = trimesh.util.concatenate([combined_mesh, geom]) + ms = combined_mesh + ms = mesh_normalize(ms) + return ms diff --git a/hy3dgen/shapegen/preprocessors.py b/hy3dgen/shapegen/preprocessors.py new file mode 100644 index 0000000..8a9cb9e --- /dev/null +++ b/hy3dgen/shapegen/preprocessors.py @@ -0,0 +1,167 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. 
+ +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import cv2 +import numpy as np +import torch +from PIL import Image +from einops import repeat, rearrange + + +def array_to_tensor(np_array): + image_pt = torch.tensor(np_array).float() + image_pt = image_pt / 255 * 2 - 1 + image_pt = rearrange(image_pt, "h w c -> c h w") + image_pts = repeat(image_pt, "c h w -> b c h w", b=1) + return image_pts + + +class ImageProcessorV2: + def __init__(self, size=512, border_ratio=None): + self.size = size + self.border_ratio = border_ratio + + @staticmethod + def recenter(image, border_ratio: float = 0.2): + """ recenter an image to leave some empty space at the image border. + + Args: + image (ndarray): input image, float/uint8 [H, W, 3/4] + mask (ndarray): alpha mask, bool [H, W] + border_ratio (float, optional): border ratio, image will be resized to (1 - border_ratio). Defaults to 0.2. + + Returns: + ndarray: output image, float/uint8 [H, W, 3/4] + """ + + if image.shape[-1] == 4: + mask = image[..., 3] + else: + mask = np.ones_like(image[..., 0:1]) * 255 + image = np.concatenate([image, mask], axis=-1) + mask = mask[..., 0] + + H, W, C = image.shape + + size = max(H, W) + result = np.zeros((size, size, C), dtype=np.uint8) + + coords = np.nonzero(mask) + x_min, x_max = coords[0].min(), coords[0].max() + y_min, y_max = coords[1].min(), coords[1].max() + h = x_max - x_min + w = y_max - y_min + if h == 0 or w == 0: + raise ValueError('input image is empty') + desired_size = int(size * (1 - border_ratio)) + scale = desired_size / max(h, w) + h2 = int(h * scale) + w2 = int(w * scale) + x2_min = (size - h2) // 2 + x2_max = x2_min + h2 + + y2_min = (size - w2) // 2 + y2_max = y2_min + w2 + + result[x2_min:x2_max, y2_min:y2_max] = cv2.resize(image[x_min:x_max, y_min:y_max], (w2, h2), + interpolation=cv2.INTER_AREA) + + bg = np.ones((result.shape[0], result.shape[1], 3), dtype=np.uint8) * 255 + + mask = result[..., 3:].astype(np.float32) / 255 + result = result[..., :3] * mask + bg * (1 - mask) + + mask = mask * 255 + result = result.clip(0, 255).astype(np.uint8) + mask = mask.clip(0, 255).astype(np.uint8) + return result, mask + + def load_image(self, image, border_ratio=0.15, to_tensor=True): + if isinstance(image, str): + image = cv2.imread(image, cv2.IMREAD_UNCHANGED) + image, mask = self.recenter(image, border_ratio=border_ratio) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + elif isinstance(image, Image.Image): + image = image.convert("RGBA") + image = np.asarray(image) + image, mask = self.recenter(image, border_ratio=border_ratio) + + image = cv2.resize(image, (self.size, self.size), interpolation=cv2.INTER_CUBIC) + mask = cv2.resize(mask, (self.size, self.size), interpolation=cv2.INTER_NEAREST) + mask = mask[..., np.newaxis] + + if to_tensor: + image = array_to_tensor(image) + mask = array_to_tensor(mask) + return image, mask + + def __call__(self, image, border_ratio=0.15, to_tensor=True, **kwargs): + if self.border_ratio is not None: + border_ratio = self.border_ratio + image, mask = self.load_image(image, border_ratio=border_ratio, to_tensor=to_tensor) + outputs = { + 'image': image, + 'mask': mask + } + return 
outputs + + +class MVImageProcessorV2(ImageProcessorV2): + """ + view order: front, front clockwise 90, back, front clockwise 270 + """ + return_view_idx = True + + def __init__(self, size=512, border_ratio=None): + super().__init__(size, border_ratio) + self.view2idx = { + 'front': 0, + 'left': 1, + 'back': 2, + 'right': 3 + } + + def __call__(self, image_dict, border_ratio=0.15, to_tensor=True, **kwargs): + if self.border_ratio is not None: + border_ratio = self.border_ratio + + images = [] + masks = [] + view_idxs = [] + for idx, (view_tag, image) in enumerate(image_dict.items()): + view_idxs.append(self.view2idx[view_tag]) + image, mask = self.load_image(image, border_ratio=border_ratio, to_tensor=to_tensor) + images.append(image) + masks.append(mask) + + zipped_lists = zip(view_idxs, images, masks) + sorted_zipped_lists = sorted(zipped_lists) + view_idxs, images, masks = zip(*sorted_zipped_lists) + + image = torch.cat(images, 0).unsqueeze(0) + mask = torch.cat(masks, 0).unsqueeze(0) + outputs = { + 'image': image, + 'mask': mask, + 'view_idxs': view_idxs + } + return outputs + + +IMAGE_PROCESSORS = { + "v2": ImageProcessorV2, + 'mv_v2': MVImageProcessorV2, +} + +DEFAULT_IMAGEPROCESSOR = 'v2' diff --git a/hy3dgen/shapegen/schedulers.py b/hy3dgen/shapegen/schedulers.py new file mode 100644 index 0000000..13f0da8 --- /dev/null +++ b/hy3dgen/shapegen/schedulers.py @@ -0,0 +1,480 @@ +# Copyright 2024 Stability AI, Katherine Crowson and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.schedulers.scheduling_utils import SchedulerMixin +from diffusers.utils import BaseOutput, logging + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class FlowMatchEulerDiscreteSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. 
+ + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + """ + + prev_sample: torch.FloatTensor + + +class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin): + """ + NOTE: this is very similar to diffusers.FlowMatchEulerDiscreteScheduler. Except our timesteps are reversed + + Euler scheduler. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + shift (`float`, defaults to 1.0): + The shift value for the timestep schedule. + """ + + _compatibles = [] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + shift: float = 1.0, + use_dynamic_shifting=False, + ): + timesteps = np.linspace(1, num_train_timesteps, num_train_timesteps, dtype=np.float32).copy() + timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32) + + sigmas = timesteps / num_train_timesteps + if not use_dynamic_shifting: + # when use_dynamic_shifting is True, we apply the timestep shifting on the fly based on the image resolution + sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) + + self.timesteps = sigmas * num_train_timesteps + + self._step_index = None + self._begin_index = None + + self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication + self.sigma_min = self.sigmas[-1].item() + self.sigma_max = self.sigmas[0].item() + + @property + def step_index(self): + """ + The index counter for current timestep. It will increase 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def scale_noise( + self, + sample: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + noise: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + """ + Forward process in flow-matching + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. 
+ """ + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=sample.device, dtype=sample.dtype) + + if sample.device.type == "mps" and torch.is_floating_point(timestep): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(sample.device, dtype=torch.float32) + timestep = timestep.to(sample.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(sample.device) + timestep = timestep.to(sample.device) + + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timestep] + elif self.step_index is not None: + # add_noise is called after first denoising step (for inpainting) + step_indices = [self.step_index] * timestep.shape[0] + else: + # add noise is called before first denoising step to create initial latent(img2img) + step_indices = [self.begin_index] * timestep.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(sample.shape): + sigma = sigma.unsqueeze(-1) + + sample = sigma * noise + (1.0 - sigma) * sample + + return sample + + def _sigma_to_t(self, sigma): + return sigma * self.config.num_train_timesteps + + def time_shift(self, mu: float, sigma: float, t: torch.Tensor): + return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma) + + def set_timesteps( + self, + num_inference_steps: int = None, + device: Union[str, torch.device] = None, + sigmas: Optional[List[float]] = None, + mu: Optional[float] = None, + ): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + + if self.config.use_dynamic_shifting and mu is None: + raise ValueError(" you have a pass a value for `mu` when `use_dynamic_shifting` is set to be `True`") + + if sigmas is None: + self.num_inference_steps = num_inference_steps + timesteps = np.linspace( + self._sigma_to_t(self.sigma_max), self._sigma_to_t(self.sigma_min), num_inference_steps + ) + + sigmas = timesteps / self.config.num_train_timesteps + + if self.config.use_dynamic_shifting: + sigmas = self.time_shift(mu, 1.0, sigmas) + else: + sigmas = self.config.shift * sigmas / (1 + (self.config.shift - 1) * sigmas) + + sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device) + timesteps = sigmas * self.config.num_train_timesteps + + self.timesteps = timesteps.to(device=device) + self.sigmas = torch.cat([sigmas, torch.ones(1, device=sigmas.device)]) + + self._step_index = None + self._begin_index = None + + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() + + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + sample: torch.FloatTensor, + s_churn: float = 0.0, + s_tmin: float = 0.0, + s_tmax: float = float("inf"), + s_noise: float = 1.0, + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + ) -> Union[FlowMatchEulerDiscreteSchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`float`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + s_churn (`float`): + s_tmin (`float`): + s_tmax (`float`): + s_noise (`float`, defaults to 1.0): + Scaling factor for noise added to the sample. + generator (`torch.Generator`, *optional*): + A random number generator. + return_dict (`bool`): + Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or + tuple. + + Returns: + [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is + returned, otherwise a tuple is returned where the first element is the sample tensor. + """ + + if ( + isinstance(timestep, int) + or isinstance(timestep, torch.IntTensor) + or isinstance(timestep, torch.LongTensor) + ): + raise ValueError( + ( + "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" + " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass" + " one of the `scheduler.timesteps` as a timestep." 
+ ), + ) + + if self.step_index is None: + self._init_step_index(timestep) + + # Upcast to avoid precision issues when computing prev_sample + sample = sample.to(torch.float32) + + sigma = self.sigmas[self.step_index] + sigma_next = self.sigmas[self.step_index + 1] + + prev_sample = sample + (sigma_next - sigma) * model_output + + # Cast sample back to model compatible dtype + prev_sample = prev_sample.to(model_output.dtype) + + # upon completion increase step index by one + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + + return FlowMatchEulerDiscreteSchedulerOutput(prev_sample=prev_sample) + + def __len__(self): + return self.config.num_train_timesteps + + +@dataclass +class ConsistencyFlowMatchEulerDiscreteSchedulerOutput(BaseOutput): + prev_sample: torch.FloatTensor + pred_original_sample: torch.FloatTensor + + +class ConsistencyFlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin): + _compatibles = [] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + pcm_timesteps: int = 50, + ): + sigmas = np.linspace(0, 1, num_train_timesteps) + step_ratio = num_train_timesteps // pcm_timesteps + + euler_timesteps = (np.arange(1, pcm_timesteps) * step_ratio).round().astype(np.int64) - 1 + euler_timesteps = np.asarray([0] + euler_timesteps.tolist()) + + self.euler_timesteps = euler_timesteps + self.sigmas = sigmas[self.euler_timesteps] + self.sigmas = torch.from_numpy((self.sigmas.copy())).to(dtype=torch.float32) + self.timesteps = self.sigmas * num_train_timesteps + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + @property + def step_index(self): + """ + The index counter for current timestep. It will increase 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def _sigma_to_t(self, sigma): + return sigma * self.config.num_train_timesteps + + def set_timesteps( + self, + num_inference_steps: int = None, + device: Union[str, torch.device] = None, + sigmas: Optional[List[float]] = None, + ): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. 
+ """ + self.num_inference_steps = num_inference_steps if num_inference_steps is not None else len(sigmas) + inference_indices = np.linspace( + 0, self.config.pcm_timesteps, num=self.num_inference_steps, endpoint=False + ) + inference_indices = np.floor(inference_indices).astype(np.int64) + inference_indices = torch.from_numpy(inference_indices).long() + + self.sigmas_ = self.sigmas[inference_indices] + timesteps = self.sigmas_ * self.config.num_train_timesteps + self.timesteps = timesteps.to(device=device) + self.sigmas_ = torch.cat( + [self.sigmas_, torch.ones(1, device=self.sigmas_.device)] + ) + + self._step_index = None + self._begin_index = None + + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. for image-to-image) + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() + + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + sample: torch.FloatTensor, + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + ) -> Union[ConsistencyFlowMatchEulerDiscreteSchedulerOutput, Tuple]: + if ( + isinstance(timestep, int) + or isinstance(timestep, torch.IntTensor) + or isinstance(timestep, torch.LongTensor) + ): + raise ValueError( + ( + "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" + " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass" + " one of the `scheduler.timesteps` as a timestep." + ), + ) + + if self.step_index is None: + self._init_step_index(timestep) + + sample = sample.to(torch.float32) + + sigma = self.sigmas_[self.step_index] + sigma_next = self.sigmas_[self.step_index + 1] + + prev_sample = sample + (sigma_next - sigma) * model_output + prev_sample = prev_sample.to(model_output.dtype) + + pred_original_sample = sample + (1.0 - sigma) * model_output + pred_original_sample = pred_original_sample.to(model_output.dtype) + + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + + return ConsistencyFlowMatchEulerDiscreteSchedulerOutput(prev_sample=prev_sample, + pred_original_sample=pred_original_sample) + + def __len__(self): + return self.config.num_train_timesteps diff --git a/hy3dgen/shapegen/utils.py b/hy3dgen/shapegen/utils.py new file mode 100644 index 0000000..6ac8f5d --- /dev/null +++ b/hy3dgen/shapegen/utils.py @@ -0,0 +1,126 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. 
+ +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import logging +import os +from functools import wraps + +import torch + + +def get_logger(name): + logger = logging.getLogger(name) + logger.setLevel(logging.INFO) + + console_handler = logging.StreamHandler() + console_handler.setLevel(logging.INFO) + + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + console_handler.setFormatter(formatter) + logger.addHandler(console_handler) + return logger + + +logger = get_logger('hy3dgen.shapgen') + + +class synchronize_timer: + """ Synchronized timer to count the inference time of `nn.Module.forward`. + + Supports both context manager and decorator usage. + + Example as context manager: + ```python + with synchronize_timer('name') as t: + run() + ``` + + Example as decorator: + ```python + @synchronize_timer('Export to trimesh') + def export_to_trimesh(mesh_output): + pass + ``` + """ + + def __init__(self, name=None): + self.name = name + + def __enter__(self): + """Context manager entry: start timing.""" + if os.environ.get('HY3DGEN_DEBUG', '0') == '1': + self.start = torch.cuda.Event(enable_timing=True) + self.end = torch.cuda.Event(enable_timing=True) + self.start.record() + return lambda: self.time + + def __exit__(self, exc_type, exc_value, exc_tb): + """Context manager exit: stop timing and log results.""" + if os.environ.get('HY3DGEN_DEBUG', '0') == '1': + self.end.record() + torch.cuda.synchronize() + self.time = self.start.elapsed_time(self.end) + if self.name is not None: + logger.info(f'{self.name} takes {self.time} ms') + + def __call__(self, func): + """Decorator: wrap the function to time its execution.""" + + @wraps(func) + def wrapper(*args, **kwargs): + with self: + result = func(*args, **kwargs) + return result + + return wrapper + + +def smart_load_model( + model_path, + subfolder, + use_safetensors, + variant, +): + original_model_path = model_path + # try local path + base_dir = os.environ.get('HY3DGEN_MODELS', '~/.cache/hy3dgen') + model_path = os.path.expanduser(os.path.join(base_dir, model_path, subfolder)) + logger.info(f'Try to load model from local path: {model_path}') + if not os.path.exists(model_path): + logger.info('Model path not exists, try to download from huggingface') + try: + from huggingface_hub import snapshot_download + # 只下载指定子目录 + path = snapshot_download( + repo_id=original_model_path, + allow_patterns=[f"{subfolder}/*"], # 关键修改:模式匹配子文件夹 + ) + model_path = os.path.join(path, subfolder) # 保持路径拼接逻辑不变 + except ImportError: + logger.warning( + "You need to install HuggingFace Hub to load models from the hub." 
+ ) + raise RuntimeError(f"Model path {model_path} not found") + except Exception as e: + raise e + + if not os.path.exists(model_path): + raise FileNotFoundError(f"Model path {original_model_path} not found") + + extension = 'ckpt' if not use_safetensors else 'safetensors' + variant = '' if variant is None else f'.{variant}' + ckpt_name = f'model{variant}.{extension}' + config_path = os.path.join(model_path, 'config.yaml') + ckpt_path = os.path.join(model_path, ckpt_name) + return config_path, ckpt_path diff --git a/hy3dgen/texgen/__init__.py b/hy3dgen/texgen/__init__.py new file mode 100644 index 0000000..7054c57 --- /dev/null +++ b/hy3dgen/texgen/__init__.py @@ -0,0 +1,16 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + + +from .pipelines import Hunyuan3DPaintPipeline, Hunyuan3DTexGenConfig diff --git a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/__init__.py b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/__init__.py new file mode 100644 index 0000000..f471e1a --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/__init__.py @@ -0,0 +1,22 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
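+
+# Note: the star imports below re-export this package's public surface -- the
+# GLB/OBJ loaders from `io_glb`/`io_obj` and the rasterization entry points
+# from `render` -- while the original explicit import list is kept commented
+# out for reference.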
+ +''' +from .hierarchy import BuildHierarchy, BuildHierarchyWithColor +from .io_obj import LoadObj, LoadObjWithTexture +from .render import rasterize, interpolate +''' +from .io_glb import * +from .io_obj import * +from .render import * diff --git a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_glb.py b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_glb.py new file mode 100644 index 0000000..f1daf7f --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_glb.py @@ -0,0 +1,241 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import base64 +import io +import os + +import numpy as np +from PIL import Image as PILImage +from pygltflib import GLTF2 +from scipy.spatial.transform import Rotation as R + + +# Function to extract buffer data +def get_buffer_data(gltf, buffer_view): + buffer = gltf.buffers[buffer_view.buffer] + buffer_data = gltf.get_data_from_buffer_uri(buffer.uri) + byte_offset = buffer_view.byteOffset if buffer_view.byteOffset else 0 + byte_length = buffer_view.byteLength + return buffer_data[byte_offset:byte_offset + byte_length] + + +# Function to extract attribute data +def get_attribute_data(gltf, accessor_index): + accessor = gltf.accessors[accessor_index] + buffer_view = gltf.bufferViews[accessor.bufferView] + buffer_data = get_buffer_data(gltf, buffer_view) + + comptype = {5120: np.int8, 5121: np.uint8, 5122: np.int16, 5123: np.uint16, 5125: np.uint32, 5126: np.float32} + dtype = comptype[accessor.componentType] + + t2n = {'SCALAR': 1, 'VEC2': 2, 'VEC3': 3, 'VEC4': 4, 'MAT2': 4, 'MAT3': 9, 'MAT4': 16} + num_components = t2n[accessor.type] + + # Calculate the correct slice of data + byte_offset = accessor.byteOffset if accessor.byteOffset else 0 + byte_stride = buffer_view.byteStride if buffer_view.byteStride else num_components * np.dtype(dtype).itemsize + count = accessor.count + + # Extract the attribute data + attribute_data = np.zeros((count, num_components), dtype=dtype) + for i in range(count): + start = byte_offset + i * byte_stride + end = start + num_components * np.dtype(dtype).itemsize + attribute_data[i] = np.frombuffer(buffer_data[start:end], dtype=dtype) + + return attribute_data + + +# Function to extract image data +def get_image_data(gltf, image, folder): + if image.uri: + if image.uri.startswith('data:'): + # Data URI + header, encoded = image.uri.split(',', 1) + data = base64.b64decode(encoded) + else: + # External file + fn = image.uri + if not os.path.isabs(fn): + fn = folder + '/' + fn + with open(fn, 'rb') as f: + data = f.read() + else: + buffer_view = gltf.bufferViews[image.bufferView] + data = get_buffer_data(gltf, buffer_view) + return data 
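+
+# Worked example for the two converters below: a TRIANGLE_STRIP index list
+# [0, 1, 2, 3, 4] expands to the triangles (0, 1, 2), (1, 3, 2), (2, 3, 4)
+# (the winding is flipped on every odd step so all faces keep a consistent
+# orientation), while a TRIANGLE_FAN list [0, 1, 2, 3, 4] expands to
+# (0, 1, 2), (0, 2, 3), (0, 3, 4).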
+ + +# Function to convert triangle strip to triangles +def convert_triangle_strip_to_triangles(indices): + triangles = [] + for i in range(len(indices) - 2): + if i % 2 == 0: + triangles.append([indices[i], indices[i + 1], indices[i + 2]]) + else: + triangles.append([indices[i], indices[i + 2], indices[i + 1]]) + return np.array(triangles).reshape(-1, 3) + + +# Function to convert triangle fan to triangles +def convert_triangle_fan_to_triangles(indices): + triangles = [] + for i in range(1, len(indices) - 1): + triangles.append([indices[0], indices[i], indices[i + 1]]) + return np.array(triangles).reshape(-1, 3) + + +# Function to get the transformation matrix from a node +def get_node_transform(node): + if node.matrix: + return np.array(node.matrix).reshape(4, 4).T + else: + T = np.eye(4) + if node.translation: + T[:3, 3] = node.translation + if node.rotation: + R_mat = R.from_quat(node.rotation).as_matrix() + T[:3, :3] = R_mat + if node.scale: + S = np.diag(node.scale + [1]) + T = T @ S + return T + + +def get_world_transform(gltf, node_index, parents, world_transforms): + if parents[node_index] == -2: + return world_transforms[node_index] + + node = gltf.nodes[node_index] + if parents[node_index] == -1: + world_transforms[node_index] = get_node_transform(node) + parents[node_index] = -2 + return world_transforms[node_index] + + parent_index = parents[node_index] + parent_transform = get_world_transform(gltf, parent_index, parents, world_transforms) + world_transforms[node_index] = parent_transform @ get_node_transform(node) + parents[node_index] = -2 + return world_transforms[node_index] + + +def LoadGlb(path): + # Load the GLB file using pygltflib + gltf = GLTF2().load(path) + + primitives = [] + images = {} + # Iterate through the meshes in the GLB file + + world_transforms = [np.identity(4) for i in range(len(gltf.nodes))] + parents = [-1 for i in range(len(gltf.nodes))] + for node_index, node in enumerate(gltf.nodes): + for idx in node.children: + parents[idx] = node_index + # for i in range(len(gltf.nodes)): + # get_world_transform(gltf, i, parents, world_transform) + + for node_index, node in enumerate(gltf.nodes): + if node.mesh is not None: + world_transform = get_world_transform(gltf, node_index, parents, world_transforms) + # Iterate through the primitives in the mesh + mesh = gltf.meshes[node.mesh] + for primitive in mesh.primitives: + # Access the attributes of the primitive + attributes = primitive.attributes.__dict__ + mode = primitive.mode if primitive.mode is not None else 4 # Default to TRIANGLES + result = {} + if primitive.indices is not None: + indices = get_attribute_data(gltf, primitive.indices) + if mode == 4: # TRIANGLES + face_indices = indices.reshape(-1, 3) + elif mode == 5: # TRIANGLE_STRIP + face_indices = convert_triangle_strip_to_triangles(indices) + elif mode == 6: # TRIANGLE_FAN + face_indices = convert_triangle_fan_to_triangles(indices) + else: + continue + result['F'] = face_indices + + # Extract vertex positions + if 'POSITION' in attributes and attributes['POSITION'] is not None: + positions = get_attribute_data(gltf, attributes['POSITION']) + # Apply the world transformation to the positions + positions_homogeneous = np.hstack([positions, np.ones((positions.shape[0], 1))]) + transformed_positions = (world_transform @ positions_homogeneous.T).T[:, :3] + result['V'] = transformed_positions + + # Extract vertex colors + if 'COLOR_0' in attributes and attributes['COLOR_0'] is not None: + colors = get_attribute_data(gltf, attributes['COLOR_0']) + if 
colors.shape[-1] > 3: + colors = colors[..., :3] + result['VC'] = colors + + # Extract UVs + if 'TEXCOORD_0' in attributes and not attributes['TEXCOORD_0'] is None: + uvs = get_attribute_data(gltf, attributes['TEXCOORD_0']) + result['UV'] = uvs + + if primitive.material is not None: + material = gltf.materials[primitive.material] + if ( + material.pbrMetallicRoughness is not None + and material.pbrMetallicRoughness.baseColorTexture is not None + ): + texture_index = material.pbrMetallicRoughness.baseColorTexture.index + texture = gltf.textures[texture_index] + image_index = texture.source + if not image_index in images: + image = gltf.images[image_index] + image_data = get_image_data(gltf, image, os.path.dirname(path)) + pil_image = PILImage.open(io.BytesIO(image_data)) + if pil_image.mode != 'RGB': + pil_image = pil_image.convert('RGB') + images[image_index] = pil_image + result['TEX'] = image_index + elif material.emissiveTexture is not None: + texture_index = material.emissiveTexture.index + texture = gltf.textures[texture_index] + image_index = texture.source + if not image_index in images: + image = gltf.images[image_index] + image_data = get_image_data(gltf, image, os.path.dirname(path)) + pil_image = PILImage.open(io.BytesIO(image_data)) + if pil_image.mode != 'RGB': + pil_image = pil_image.convert('RGB') + images[image_index] = pil_image + result['TEX'] = image_index + else: + if material.pbrMetallicRoughness is not None: + base_color = material.pbrMetallicRoughness.baseColorFactor + else: + base_color = np.array([0.8, 0.8, 0.8], dtype=np.float32) + result['MC'] = base_color + + primitives.append(result) + + return primitives, images + + +def RotatePrimitives(primitives, transform): + for i in range(len(primitives)): + if 'V' in primitives[i]: + primitives[i]['V'] = primitives[i]['V'] @ transform.T + + +if __name__ == '__main__': + path = 'data/test.glb' + LoadGlb(path) diff --git a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_obj.py b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_obj.py new file mode 100644 index 0000000..e40d500 --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/io_obj.py @@ -0,0 +1,66 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
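The strip and fan converters defined earlier in io_glb.py are easy to sanity-check by hand. The snippet below restates their logic standalone (it does not import the module): a 5-index triangle strip yields three triangles with alternating winding, while a 5-index fan anchors every triangle at index 0.

import numpy as np

def strip_to_triangles(indices):
    tris = []
    for i in range(len(indices) - 2):
        if i % 2 == 0:
            tris.append([indices[i], indices[i + 1], indices[i + 2]])
        else:
            tris.append([indices[i], indices[i + 2], indices[i + 1]])
    return np.array(tris).reshape(-1, 3)

def fan_to_triangles(indices):
    return np.array([[indices[0], indices[i], indices[i + 1]]
                     for i in range(1, len(indices) - 1)]).reshape(-1, 3)

print(strip_to_triangles([0, 1, 2, 3, 4]))  # [[0 1 2] [1 3 2] [2 3 4]]
print(fan_to_triangles([0, 1, 2, 3, 4]))    # [[0 1 2] [0 2 3] [0 3 4]]

The alternating index swap in the strip case keeps a consistent facing direction, mirroring convert_triangle_strip_to_triangles above.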
+ +import cv2 +import numpy as np + + +def LoadObj(fn): + lines = [l.strip() for l in open(fn)] + vertices = [] + faces = [] + for l in lines: + words = [w for w in l.split(' ') if w != ''] + if len(words) == 0: + continue + if words[0] == 'v': + v = [float(words[i]) for i in range(1, 4)] + vertices.append(v) + elif words[0] == 'f': + f = [int(words[i]) - 1 for i in range(1, 4)] + faces.append(f) + + return np.array(vertices).astype('float32'), np.array(faces).astype('int32') + + +def LoadObjWithTexture(fn, tex_fn): + lines = [l.strip() for l in open(fn)] + vertices = [] + vertex_textures = [] + faces = [] + face_textures = [] + for l in lines: + words = [w for w in l.split(' ') if w != ''] + if len(words) == 0: + continue + if words[0] == 'v': + v = [float(words[i]) for i in range(1, len(words))] + vertices.append(v) + elif words[0] == 'vt': + v = [float(words[i]) for i in range(1, len(words))] + vertex_textures.append(v) + elif words[0] == 'f': + f = [] + ft = [] + for i in range(1, len(words)): + t = words[i].split('/') + f.append(int(t[0]) - 1) + ft.append(int(t[1]) - 1) + for i in range(2, len(f)): + faces.append([f[0], f[i - 1], f[i]]) + face_textures.append([ft[0], ft[i - 1], ft[i]]) + + tex_image = cv2.cvtColor(cv2.imread(tex_fn), cv2.COLOR_BGR2RGB) + return np.array(vertices).astype('float32'), np.array(vertex_textures).astype('float32'), np.array(faces).astype( + 'int32'), np.array(face_textures).astype('int32'), tex_image diff --git a/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/render.py b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/render.py new file mode 100644 index 0000000..2d4d3f7 --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/custom_rasterizer/render.py @@ -0,0 +1,31 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
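Before the rasterizer wrappers below, here is a standalone illustration of the 'f v/vt' face parsing in LoadObjWithTexture above: OBJ's 1-based indices are shifted to 0-based, and an n-gon is fanned into n - 2 triangles for both position and UV indices. The face line is an invented example.

line = 'f 1/1 2/2 3/3 4/4'   # a quad referencing 4 positions and 4 UVs
words = [w for w in line.split(' ') if w != '']

f, ft = [], []
for w in words[1:]:
    t = w.split('/')
    f.append(int(t[0]) - 1)   # position index, 0-based
    ft.append(int(t[1]) - 1)  # texture-coordinate index, 0-based

faces = [[f[0], f[i - 1], f[i]] for i in range(2, len(f))]
face_textures = [[ft[0], ft[i - 1], ft[i]] for i in range(2, len(ft))]
print(faces)          # [[0, 1, 2], [0, 2, 3]]
print(face_textures)  # [[0, 1, 2], [0, 2, 3]]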
+ +import custom_rasterizer_kernel +import torch + + +def rasterize(pos, tri, resolution, clamp_depth=torch.zeros(0), use_depth_prior=0): + assert (pos.device == tri.device) + findices, barycentric = custom_rasterizer_kernel.rasterize_image(pos[0], tri, clamp_depth, resolution[1], + resolution[0], 1e-6, use_depth_prior) + return findices, barycentric + + +def interpolate(col, findices, barycentric, tri): + f = findices - 1 + (findices == 0) + vcol = col[0, tri.long()[f.long()]] + result = barycentric.view(*barycentric.shape, 1) * vcol + result = torch.sum(result, axis=-2) + return result.view(1, *result.shape) diff --git a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/__init__.py b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/__init__.py new file mode 100644 index 0000000..1614ff8 --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/__init__.py @@ -0,0 +1,13 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
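A hedged usage sketch of the rasterize/interpolate pair defined in render.py above. It assumes the custom_rasterizer package and its custom_rasterizer_kernel extension have been built and installed (see the setup.py later in this diff); with CPU tensors the kernel dispatches to its CPU path. The triangle and colors are arbitrary test values.

import torch
import custom_rasterizer as cr

# One triangle in clip space (x, y, z, w), batched with a leading dim of 1.
pos = torch.tensor([[[-0.5, -0.5, 0.0, 1.0],
                     [ 0.5, -0.5, 0.0, 1.0],
                     [ 0.0,  0.5, 0.0, 1.0]]], dtype=torch.float32)
tri = torch.tensor([[0, 1, 2]], dtype=torch.int32)

findices, barycentric = cr.rasterize(pos, tri, resolution=(256, 256))

# Per-vertex RGB colors interpolated into an image via the rasterized barycentrics.
col = torch.tensor([[[1.0, 0.0, 0.0],
                     [0.0, 1.0, 0.0],
                     [0.0, 0.0, 1.0]]], dtype=torch.float32)
image = cr.interpolate(col, findices, barycentric, tri)  # shape [1, 256, 256, 3]

This mirrors how MeshRender later in this diff wires the same two calls together inside raster_rasterize and raster_interpolate.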
diff --git a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/grid_neighbor.cpp b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/grid_neighbor.cpp new file mode 100644 index 0000000..65ab321 --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/grid_neighbor.cpp @@ -0,0 +1,574 @@ +#include "rasterizer.h" +#include + +inline int pos2key(float* p, int resolution) { + int x = (p[0] * 0.5 + 0.5) * resolution; + int y = (p[1] * 0.5 + 0.5) * resolution; + int z = (p[2] * 0.5 + 0.5) * resolution; + return (x * resolution + y) * resolution + z; +} + +inline void key2pos(int key, int resolution, float* p) { + int x = key / resolution / resolution; + int y = key / resolution % resolution; + int z = key % resolution; + p[0] = ((x + 0.5) / resolution - 0.5) * 2; + p[1] = ((y + 0.5) / resolution - 0.5) * 2; + p[2] = ((z + 0.5) / resolution - 0.5) * 2; +} + +inline void key2cornerpos(int key, int resolution, float* p) { + int x = key / resolution / resolution; + int y = key / resolution % resolution; + int z = key % resolution; + p[0] = ((x + 0.75) / resolution - 0.5) * 2; + p[1] = ((y + 0.25) / resolution - 0.5) * 2; + p[2] = ((z + 0.75) / resolution - 0.5) * 2; +} + +inline float* pos_ptr(int l, int i, int j, torch::Tensor t) { + float* pdata = t.data_ptr(); + int height = t.size(1); + int width = t.size(2); + return &pdata[((l * height + i) * width + j) * 4]; +} + +struct Grid +{ + std::vector seq2oddcorner; + std::vector seq2evencorner; + std::vector seq2grid; + std::vector seq2normal; + std::vector seq2neighbor; + std::unordered_map grid2seq; + std::vector downsample_seq; + int num_origin_seq; + int resolution; + int stride; +}; + +inline void pos_from_seq(Grid& grid, int seq, float* p) { + auto k = grid.seq2grid[seq]; + key2pos(k, grid.resolution, p); +} + +inline int fetch_seq(Grid& grid, int l, int i, int j, torch::Tensor pdata) { + float* p = pos_ptr(l, i, j, pdata); + if (p[3] == 0) + return -1; + auto key = pos2key(p, grid.resolution); + int seq = grid.grid2seq[key]; + return seq; +} + +inline int fetch_last_seq(Grid& grid, int i, int j, torch::Tensor pdata) { + int num_layers = pdata.size(0); + int l = 0; + int idx = fetch_seq(grid, l, i, j, pdata); + while (l < num_layers - 1) { + l += 1; + int new_idx = fetch_seq(grid, l, i, j, pdata); + if (new_idx == -1) + break; + idx = new_idx; + } + return idx; +} + +inline int fetch_nearest_seq(Grid& grid, int i, int j, int dim, float d, torch::Tensor pdata) { + float p[3]; + float max_dist = 1e10; + int best_idx = -1; + int num_layers = pdata.size(0); + for (int l = 0; l < num_layers; ++l) { + int idx = fetch_seq(grid, l, i, j, pdata); + if (idx == -1) + break; + pos_from_seq(grid, idx, p); + float dist = std::abs(d - p[(dim + 2) % 3]); + if (dist < max_dist) { + max_dist = dist; + best_idx = idx; + } + } + return best_idx; +} + +inline int fetch_nearest_seq_layer(Grid& grid, int i, int j, int dim, float d, torch::Tensor pdata) { + float p[3]; + float max_dist = 1e10; + int best_layer = -1; + int num_layers = pdata.size(0); + for (int l = 0; l < num_layers; ++l) { + int idx = fetch_seq(grid, l, i, j, pdata); + if (idx == -1) + break; + pos_from_seq(grid, idx, p); + float dist = std::abs(d - p[(dim + 2) % 3]); + if (dist < max_dist) { + max_dist = dist; + best_layer = l; + } + } + return best_layer; +} + +void FetchNeighbor(Grid& grid, int seq, float* pos, int dim, int boundary_info, std::vector& view_layer_positions, + int* output_indices) +{ + auto t = view_layer_positions[dim]; + int height = 
t.size(1); + int width = t.size(2); + int top = 0; + int ci = 0; + int cj = 0; + if (dim == 0) { + ci = (pos[1]/2+0.5)*height; + cj = (pos[0]/2+0.5)*width; + } + else if (dim == 1) { + ci = (pos[1]/2+0.5)*height; + cj = (pos[2]/2+0.5)*width; + } + else { + ci = (-pos[2]/2+0.5)*height; + cj = (pos[0]/2+0.5)*width; + } + int stride = grid.stride; + for (int ni = ci + stride; ni >= ci - stride; ni -= stride) { + for (int nj = cj - stride; nj <= cj + stride; nj += stride) { + int idx = -1; + if (ni == ci && nj == cj) + idx = seq; + else if (!(ni < 0 || ni >= height || nj < 0 || nj >= width)) { + if (boundary_info == -1) + idx = fetch_seq(grid, 0, ni, nj, t); + else if (boundary_info == 1) + idx = fetch_last_seq(grid, ni, nj, t); + else + idx = fetch_nearest_seq(grid, ni, nj, dim, pos[(dim + 2) % 3], t); + } + output_indices[top] = idx; + top += 1; + } + } +} + +void DownsampleGrid(Grid& src, Grid& tar) +{ + src.downsample_seq.resize(src.seq2grid.size(), -1); + tar.resolution = src.resolution / 2; + tar.stride = src.stride * 2; + float pos[3]; + std::vector seq2normal_count; + for (int i = 0; i < src.seq2grid.size(); ++i) { + key2pos(src.seq2grid[i], src.resolution, pos); + int k = pos2key(pos, tar.resolution); + int s = seq2normal_count.size(); + if (!tar.grid2seq.count(k)) { + tar.grid2seq[k] = tar.seq2grid.size(); + tar.seq2grid.emplace_back(k); + seq2normal_count.emplace_back(0); + seq2normal_count.emplace_back(0); + seq2normal_count.emplace_back(0); + //tar.seq2normal.emplace_back(src.seq2normal[i]); + } else { + s = tar.grid2seq[k] * 3; + } + seq2normal_count[s + src.seq2normal[i]] += 1; + src.downsample_seq[i] = tar.grid2seq[k]; + } + tar.seq2normal.resize(seq2normal_count.size() / 3); + for (int i = 0; i < seq2normal_count.size(); i += 3) { + int t = 0; + for (int j = 1; j < 3; ++j) { + if (seq2normal_count[i + j] > seq2normal_count[i + t]) + t = j; + } + tar.seq2normal[i / 3] = t; + } +} + +void NeighborGrid(Grid& grid, std::vector view_layer_positions, int v) +{ + grid.seq2evencorner.resize(grid.seq2grid.size(), 0); + grid.seq2oddcorner.resize(grid.seq2grid.size(), 0); + std::unordered_set visited_seq; + for (int vd = 0; vd < 3; ++vd) { + auto t = view_layer_positions[vd]; + auto t0 = view_layer_positions[v]; + int height = t.size(1); + int width = t.size(2); + int num_layers = t.size(0); + int num_view_layers = t0.size(0); + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + for (int l = 0; l < num_layers; ++l) { + int seq = fetch_seq(grid, l, i, j, t); + if (seq == -1) + break; + int dim = grid.seq2normal[seq]; + if (dim != v) + continue; + + float pos[3]; + pos_from_seq(grid, seq, pos); + + int ci = 0; + int cj = 0; + if (dim == 0) { + ci = (pos[1]/2+0.5)*height; + cj = (pos[0]/2+0.5)*width; + } + else if (dim == 1) { + ci = (pos[1]/2+0.5)*height; + cj = (pos[2]/2+0.5)*width; + } + else { + ci = (-pos[2]/2+0.5)*height; + cj = (pos[0]/2+0.5)*width; + } + + if ((ci % (grid.stride * 2) < grid.stride) && (cj % (grid.stride * 2) >= grid.stride)) + grid.seq2evencorner[seq] = 1; + + if ((ci % (grid.stride * 2) >= grid.stride) && (cj % (grid.stride * 2) < grid.stride)) + grid.seq2oddcorner[seq] = 1; + + bool is_boundary = false; + if (vd == v) { + if (l == 0 || l == num_layers - 1) + is_boundary = true; + else { + int seq_new = fetch_seq(grid, l + 1, i, j, t); + if (seq_new == -1) + is_boundary = true; + } + } + int boundary_info = 0; + if (is_boundary && (l == 0)) + boundary_info = -1; + else if (is_boundary) + boundary_info = 1; + if (visited_seq.count(seq)) + 
continue; + visited_seq.insert(seq); + + FetchNeighbor(grid, seq, pos, dim, boundary_info, view_layer_positions, &grid.seq2neighbor[seq * 9]); + } + } + } + } +} + +void PadGrid(Grid& src, Grid& tar, std::vector& view_layer_positions) { + auto& downsample_seq = src.downsample_seq; + auto& seq2evencorner = src.seq2evencorner; + auto& seq2oddcorner = src.seq2oddcorner; + int indices[9]; + std::vector mapped_even_corners(tar.seq2grid.size(), 0); + std::vector mapped_odd_corners(tar.seq2grid.size(), 0); + for (int i = 0; i < downsample_seq.size(); ++i) { + if (seq2evencorner[i] > 0) { + mapped_even_corners[downsample_seq[i]] = 1; + } + if (seq2oddcorner[i] > 0) { + mapped_odd_corners[downsample_seq[i]] = 1; + } + } + auto& tar_seq2normal = tar.seq2normal; + auto& tar_seq2grid = tar.seq2grid; + for (int i = 0; i < tar_seq2grid.size(); ++i) { + if (mapped_even_corners[i] == 1 && mapped_odd_corners[i] == 1) + continue; + auto k = tar_seq2grid[i]; + float p[3]; + key2cornerpos(k, tar.resolution, p); + + int src_key = pos2key(p, src.resolution); + if (!src.grid2seq.count(src_key)) { + int seq = src.seq2grid.size(); + src.grid2seq[src_key] = seq; + src.seq2evencorner.emplace_back((mapped_even_corners[i] == 0)); + src.seq2oddcorner.emplace_back((mapped_odd_corners[i] == 0)); + src.seq2grid.emplace_back(src_key); + src.seq2normal.emplace_back(tar_seq2normal[i]); + FetchNeighbor(src, seq, p, tar_seq2normal[i], 0, view_layer_positions, indices); + for (int j = 0; j < 9; ++j) { + src.seq2neighbor.emplace_back(indices[j]); + } + src.downsample_seq.emplace_back(i); + } else { + int seq = src.grid2seq[src_key]; + if (mapped_even_corners[i] == 0) + src.seq2evencorner[seq] = 1; + if (mapped_odd_corners[i] == 0) + src.seq2oddcorner[seq] = 1; + } + } +} + +std::vector> build_hierarchy(std::vector view_layer_positions, + std::vector view_layer_normals, int num_level, int resolution) +{ + if (view_layer_positions.size() != 3 || num_level < 1) { + printf("Alert! We require 3 layers and at least 1 level! 
(%d %d)\n", view_layer_positions.size(), num_level); + return {{},{},{},{}}; + } + + std::vector grids; + grids.resize(num_level); + + std::vector seq2pos; + auto& seq2grid = grids[0].seq2grid; + auto& seq2normal = grids[0].seq2normal; + auto& grid2seq = grids[0].grid2seq; + grids[0].resolution = resolution; + grids[0].stride = 1; + + auto int64_options = torch::TensorOptions().dtype(torch::kInt64).requires_grad(false); + auto float_options = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false); + + for (int v = 0; v < 3; ++v) { + int num_layers = view_layer_positions[v].size(0); + int height = view_layer_positions[v].size(1); + int width = view_layer_positions[v].size(2); + float* data = view_layer_positions[v].data_ptr(); + float* data_normal = view_layer_normals[v].data_ptr(); + for (int l = 0; l < num_layers; ++l) { + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + float* p = &data[(i * width + j) * 4]; + float* n = &data_normal[(i * width + j) * 3]; + if (p[3] == 0) + continue; + auto k = pos2key(p, resolution); + if (!grid2seq.count(k)) { + int dim = 0; + for (int d = 0; d < 3; ++d) { + if (std::abs(n[d]) > std::abs(n[dim])) + dim = d; + } + dim = (dim + 1) % 3; + grid2seq[k] = seq2grid.size(); + seq2grid.emplace_back(k); + seq2pos.push_back(p[0]); + seq2pos.push_back(p[1]); + seq2pos.push_back(p[2]); + seq2normal.emplace_back(dim); + } + } + } + data += (height * width * 4); + data_normal += (height * width * 3); + } + } + + for (int i = 0; i < num_level - 1; ++i) { + DownsampleGrid(grids[i], grids[i + 1]); + } + + for (int l = 0; l < num_level; ++l) { + grids[l].seq2neighbor.resize(grids[l].seq2grid.size() * 9, -1); + grids[l].num_origin_seq = grids[l].seq2grid.size(); + for (int d = 0; d < 3; ++d) { + NeighborGrid(grids[l], view_layer_positions, d); + } + } + + for (int i = num_level - 2; i >= 0; --i) { + PadGrid(grids[i], grids[i + 1], view_layer_positions); + } + for (int i = grids[0].num_origin_seq; i < grids[0].seq2grid.size(); ++i) { + int k = grids[0].seq2grid[i]; + float p[3]; + key2pos(k, grids[0].resolution, p); + seq2pos.push_back(p[0]); + seq2pos.push_back(p[1]); + seq2pos.push_back(p[2]); + } + + std::vector texture_positions(2); + std::vector grid_neighbors(grids.size()); + std::vector grid_downsamples(grids.size() - 1); + std::vector grid_evencorners(grids.size()); + std::vector grid_oddcorners(grids.size()); + + texture_positions[0] = torch::zeros({static_cast(seq2pos.size() / 3), static_cast(3)}, float_options); + texture_positions[1] = torch::zeros({static_cast(seq2pos.size() / 3)}, float_options); + float* positions_out_ptr = texture_positions[0].data_ptr(); + memcpy(positions_out_ptr, seq2pos.data(), sizeof(float) * seq2pos.size()); + positions_out_ptr = texture_positions[1].data_ptr(); + for (int i = 0; i < grids[0].seq2grid.size(); ++i) { + positions_out_ptr[i] = (i < grids[0].num_origin_seq); + } + + for (int i = 0; i < grids.size(); ++i) { + grid_neighbors[i] = torch::zeros({static_cast(grids[i].seq2grid.size()), static_cast(9)}, int64_options); + int64_t* nptr = grid_neighbors[i].data_ptr(); + for (int j = 0; j < grids[i].seq2neighbor.size(); ++j) { + nptr[j] = grids[i].seq2neighbor[j]; + } + + grid_evencorners[i] = torch::zeros({static_cast(grids[i].seq2evencorner.size())}, int64_options); + grid_oddcorners[i] = torch::zeros({static_cast(grids[i].seq2oddcorner.size())}, int64_options); + int64_t* dptr = grid_evencorners[i].data_ptr(); + for (int j = 0; j < grids[i].seq2evencorner.size(); ++j) { + dptr[j] = 
grids[i].seq2evencorner[j]; + } + dptr = grid_oddcorners[i].data_ptr(); + for (int j = 0; j < grids[i].seq2oddcorner.size(); ++j) { + dptr[j] = grids[i].seq2oddcorner[j]; + } + if (i + 1 < grids.size()) { + grid_downsamples[i] = torch::zeros({static_cast(grids[i].downsample_seq.size())}, int64_options); + int64_t* dptr = grid_downsamples[i].data_ptr(); + for (int j = 0; j < grids[i].downsample_seq.size(); ++j) { + dptr[j] = grids[i].downsample_seq[j]; + } + } + + } + return {texture_positions, grid_neighbors, grid_downsamples, grid_evencorners, grid_oddcorners}; +} + +std::vector> build_hierarchy_with_feat( + std::vector view_layer_positions, + std::vector view_layer_normals, + std::vector view_layer_feats, + int num_level, int resolution) +{ + if (view_layer_positions.size() != 3 || num_level < 1) { + printf("Alert! We require 3 layers and at least 1 level! (%d %d)\n", view_layer_positions.size(), num_level); + return {{},{},{},{}}; + } + + std::vector grids; + grids.resize(num_level); + + std::vector seq2pos; + std::vector seq2feat; + auto& seq2grid = grids[0].seq2grid; + auto& seq2normal = grids[0].seq2normal; + auto& grid2seq = grids[0].grid2seq; + grids[0].resolution = resolution; + grids[0].stride = 1; + + auto int64_options = torch::TensorOptions().dtype(torch::kInt64).requires_grad(false); + auto float_options = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false); + + int feat_channel = 3; + for (int v = 0; v < 3; ++v) { + int num_layers = view_layer_positions[v].size(0); + int height = view_layer_positions[v].size(1); + int width = view_layer_positions[v].size(2); + float* data = view_layer_positions[v].data_ptr(); + float* data_normal = view_layer_normals[v].data_ptr(); + float* data_feat = view_layer_feats[v].data_ptr(); + feat_channel = view_layer_feats[v].size(3); + for (int l = 0; l < num_layers; ++l) { + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + float* p = &data[(i * width + j) * 4]; + float* n = &data_normal[(i * width + j) * 3]; + float* f = &data_feat[(i * width + j) * feat_channel]; + if (p[3] == 0) + continue; + auto k = pos2key(p, resolution); + if (!grid2seq.count(k)) { + int dim = 0; + for (int d = 0; d < 3; ++d) { + if (std::abs(n[d]) > std::abs(n[dim])) + dim = d; + } + dim = (dim + 1) % 3; + grid2seq[k] = seq2grid.size(); + seq2grid.emplace_back(k); + seq2pos.push_back(p[0]); + seq2pos.push_back(p[1]); + seq2pos.push_back(p[2]); + for (int c = 0; c < feat_channel; ++c) { + seq2feat.emplace_back(f[c]); + } + seq2normal.emplace_back(dim); + } + } + } + data += (height * width * 4); + data_normal += (height * width * 3); + data_feat += (height * width * feat_channel); + } + } + + for (int i = 0; i < num_level - 1; ++i) { + DownsampleGrid(grids[i], grids[i + 1]); + } + + for (int l = 0; l < num_level; ++l) { + grids[l].seq2neighbor.resize(grids[l].seq2grid.size() * 9, -1); + grids[l].num_origin_seq = grids[l].seq2grid.size(); + for (int d = 0; d < 3; ++d) { + NeighborGrid(grids[l], view_layer_positions, d); + } + } + + for (int i = num_level - 2; i >= 0; --i) { + PadGrid(grids[i], grids[i + 1], view_layer_positions); + } + for (int i = grids[0].num_origin_seq; i < grids[0].seq2grid.size(); ++i) { + int k = grids[0].seq2grid[i]; + float p[3]; + key2pos(k, grids[0].resolution, p); + seq2pos.push_back(p[0]); + seq2pos.push_back(p[1]); + seq2pos.push_back(p[2]); + for (int c = 0; c < feat_channel; ++c) { + seq2feat.emplace_back(0.5); + } + } + + std::vector texture_positions(2); + std::vector texture_feats(1); + std::vector 
grid_neighbors(grids.size()); + std::vector grid_downsamples(grids.size() - 1); + std::vector grid_evencorners(grids.size()); + std::vector grid_oddcorners(grids.size()); + + texture_positions[0] = torch::zeros({static_cast(seq2pos.size() / 3), static_cast(3)}, float_options); + texture_positions[1] = torch::zeros({static_cast(seq2pos.size() / 3)}, float_options); + texture_feats[0] = torch::zeros({static_cast(seq2feat.size() / feat_channel), static_cast(feat_channel)}, float_options); + float* positions_out_ptr = texture_positions[0].data_ptr(); + memcpy(positions_out_ptr, seq2pos.data(), sizeof(float) * seq2pos.size()); + positions_out_ptr = texture_positions[1].data_ptr(); + for (int i = 0; i < grids[0].seq2grid.size(); ++i) { + positions_out_ptr[i] = (i < grids[0].num_origin_seq); + } + float* feats_out_ptr = texture_feats[0].data_ptr(); + memcpy(feats_out_ptr, seq2feat.data(), sizeof(float) * seq2feat.size()); + + for (int i = 0; i < grids.size(); ++i) { + grid_neighbors[i] = torch::zeros({static_cast(grids[i].seq2grid.size()), static_cast(9)}, int64_options); + int64_t* nptr = grid_neighbors[i].data_ptr(); + for (int j = 0; j < grids[i].seq2neighbor.size(); ++j) { + nptr[j] = grids[i].seq2neighbor[j]; + } + grid_evencorners[i] = torch::zeros({static_cast(grids[i].seq2evencorner.size())}, int64_options); + grid_oddcorners[i] = torch::zeros({static_cast(grids[i].seq2oddcorner.size())}, int64_options); + int64_t* dptr = grid_evencorners[i].data_ptr(); + for (int j = 0; j < grids[i].seq2evencorner.size(); ++j) { + dptr[j] = grids[i].seq2evencorner[j]; + } + dptr = grid_oddcorners[i].data_ptr(); + for (int j = 0; j < grids[i].seq2oddcorner.size(); ++j) { + dptr[j] = grids[i].seq2oddcorner[j]; + } + if (i + 1 < grids.size()) { + grid_downsamples[i] = torch::zeros({static_cast(grids[i].downsample_seq.size())}, int64_options); + int64_t* dptr = grid_downsamples[i].data_ptr(); + for (int j = 0; j < grids[i].downsample_seq.size(); ++j) { + dptr[j] = grids[i].downsample_seq[j]; + } + } + } + return {texture_positions, texture_feats, grid_neighbors, grid_downsamples, grid_evencorners, grid_oddcorners}; +} diff --git a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.cpp b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.cpp new file mode 100644 index 0000000..4af6eeb --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.cpp @@ -0,0 +1,139 @@ +#include "rasterizer.h" + +void rasterizeTriangleCPU(int idx, float* vt0, float* vt1, float* vt2, int width, int height, INT64* zbuffer, float* d, float occlusion_truncation) { + float x_min = std::min(vt0[0], std::min(vt1[0],vt2[0])); + float x_max = std::max(vt0[0], std::max(vt1[0],vt2[0])); + float y_min = std::min(vt0[1], std::min(vt1[1],vt2[1])); + float y_max = std::max(vt0[1], std::max(vt1[1],vt2[1])); + + for (int px = x_min; px < x_max + 1; ++px) { + if (px < 0 || px >= width) + continue; + for (int py = y_min; py < y_max + 1; ++py) { + if (py < 0 || py >= height) + continue; + float vt[2] = {px + 0.5, py + 0.5}; + float baryCentricCoordinate[3]; + calculateBarycentricCoordinate(vt0, vt1, vt2, vt, baryCentricCoordinate); + if (isBarycentricCoordInBounds(baryCentricCoordinate)) { + int pixel = py * width + px; + if (zbuffer == 0) { + zbuffer[pixel] = (INT64)(idx + 1); + continue; + } + + float depth = baryCentricCoordinate[0] * vt0[2] + baryCentricCoordinate[1] * vt1[2] + baryCentricCoordinate[2] * vt2[2]; + float depth_thres = 0; + if (d) { + depth_thres = 
d[pixel] * 0.49999f + 0.5f + occlusion_truncation; + } + + int z_quantize = depth * (2<<17); + INT64 token = (INT64)z_quantize * MAXINT + (INT64)(idx + 1); + if (depth < depth_thres) + continue; + zbuffer[pixel] = std::min(zbuffer[pixel], token); + } + } + } +} + +void barycentricFromImgcoordCPU(float* V, int* F, int* findices, INT64* zbuffer, int width, int height, int num_vertices, int num_faces, + float* barycentric_map, int pix) +{ + INT64 f = zbuffer[pix] % MAXINT; + if (f == (MAXINT-1)) { + findices[pix] = 0; + barycentric_map[pix * 3] = 0; + barycentric_map[pix * 3 + 1] = 0; + barycentric_map[pix * 3 + 2] = 0; + return; + } + findices[pix] = f; + f -= 1; + float barycentric[3] = {0, 0, 0}; + if (f >= 0) { + float vt[2] = {float(pix % width) + 0.5f, float(pix / width) + 0.5f}; + float* vt0_ptr = V + (F[f * 3] * 4); + float* vt1_ptr = V + (F[f * 3 + 1] * 4); + float* vt2_ptr = V + (F[f * 3 + 2] * 4); + + float vt0[2] = {(vt0_ptr[0] / vt0_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt0_ptr[1] / vt0_ptr[3]) * (height - 1) + 0.5f}; + float vt1[2] = {(vt1_ptr[0] / vt1_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt1_ptr[1] / vt1_ptr[3]) * (height - 1) + 0.5f}; + float vt2[2] = {(vt2_ptr[0] / vt2_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt2_ptr[1] / vt2_ptr[3]) * (height - 1) + 0.5f}; + + calculateBarycentricCoordinate(vt0, vt1, vt2, vt, barycentric); + + barycentric[0] = barycentric[0] / vt0_ptr[3]; + barycentric[1] = barycentric[1] / vt1_ptr[3]; + barycentric[2] = barycentric[2] / vt2_ptr[3]; + float w = 1.0f / (barycentric[0] + barycentric[1] + barycentric[2]); + barycentric[0] *= w; + barycentric[1] *= w; + barycentric[2] *= w; + + } + barycentric_map[pix * 3] = barycentric[0]; + barycentric_map[pix * 3 + 1] = barycentric[1]; + barycentric_map[pix * 3 + 2] = barycentric[2]; +} + +void rasterizeImagecoordsKernelCPU(float* V, int* F, float* d, INT64* zbuffer, float occlusion_trunc, int width, int height, int num_vertices, int num_faces, int f) +{ + float* vt0_ptr = V + (F[f * 3] * 4); + float* vt1_ptr = V + (F[f * 3 + 1] * 4); + float* vt2_ptr = V + (F[f * 3 + 2] * 4); + + float vt0[3] = {(vt0_ptr[0] / vt0_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt0_ptr[1] / vt0_ptr[3]) * (height - 1) + 0.5f, vt0_ptr[2] / vt0_ptr[3] * 0.49999f + 0.5f}; + float vt1[3] = {(vt1_ptr[0] / vt1_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt1_ptr[1] / vt1_ptr[3]) * (height - 1) + 0.5f, vt1_ptr[2] / vt1_ptr[3] * 0.49999f + 0.5f}; + float vt2[3] = {(vt2_ptr[0] / vt2_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt2_ptr[1] / vt2_ptr[3]) * (height - 1) + 0.5f, vt2_ptr[2] / vt2_ptr[3] * 0.49999f + 0.5f}; + + rasterizeTriangleCPU(f, vt0, vt1, vt2, width, height, zbuffer, d, occlusion_trunc); +} + +std::vector rasterize_image_cpu(torch::Tensor V, torch::Tensor F, torch::Tensor D, + int width, int height, float occlusion_truncation, int use_depth_prior) +{ + int num_faces = F.size(0); + int num_vertices = V.size(0); + auto options = torch::TensorOptions().dtype(torch::kInt32).requires_grad(false); + auto INT64_options = torch::TensorOptions().dtype(torch::kInt64).requires_grad(false); + auto findices = torch::zeros({height, width}, options); + INT64 maxint = (INT64)MAXINT * (INT64)MAXINT + (MAXINT - 1); + auto z_min = torch::ones({height, width}, INT64_options) * (int64_t)maxint; + + if (!use_depth_prior) { + for (int i = 0; i < num_faces; ++i) { + rasterizeImagecoordsKernelCPU(V.data_ptr(), F.data_ptr(), 0, + 
(INT64*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces, i); + } + } else { + for (int i = 0; i < num_faces; ++i) + rasterizeImagecoordsKernelCPU(V.data_ptr(), F.data_ptr(), D.data_ptr(), + (INT64*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces, i); + } + + auto float_options = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false); + auto barycentric = torch::zeros({height, width, 3}, float_options); + for (int i = 0; i < width * height; ++i) + barycentricFromImgcoordCPU(V.data_ptr(), F.data_ptr(), + findices.data_ptr(), (INT64*)z_min.data_ptr(), width, height, num_vertices, num_faces, barycentric.data_ptr(), i); + + return {findices, barycentric}; +} + +std::vector rasterize_image(torch::Tensor V, torch::Tensor F, torch::Tensor D, + int width, int height, float occlusion_truncation, int use_depth_prior) +{ + int device_id = V.get_device(); + if (device_id == -1) + return rasterize_image_cpu(V, F, D, width, height, occlusion_truncation, use_depth_prior); + else + return rasterize_image_gpu(V, F, D, width, height, occlusion_truncation, use_depth_prior); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("rasterize_image", &rasterize_image, "Custom image rasterization"); + m.def("build_hierarchy", &build_hierarchy, "Custom image rasterization"); + m.def("build_hierarchy_with_feat", &build_hierarchy_with_feat, "Custom image rasterization"); +} diff --git a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.h b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.h new file mode 100644 index 0000000..cf4f987 --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer.h @@ -0,0 +1,54 @@ +#ifndef RASTERIZER_H_ +#define RASTERIZER_H_ + +#include +#include +#include +#include // For CUDA context + +#define INT64 unsigned long long +#define MAXINT 2147483647 + +__host__ __device__ inline float calculateSignedArea2(float* a, float* b, float* c) { + return ((c[0] - a[0]) * (b[1] - a[1]) - (b[0] - a[0]) * (c[1] - a[1])); +} + +__host__ __device__ inline void calculateBarycentricCoordinate(float* a, float* b, float* c, float* p, + float* barycentric) +{ + float beta_tri = calculateSignedArea2(a, p, c); + float gamma_tri = calculateSignedArea2(a, b, p); + float area = calculateSignedArea2(a, b, c); + if (area == 0) { + barycentric[0] = -1.0; + barycentric[1] = -1.0; + barycentric[2] = -1.0; + return; + } + float tri_inv = 1.0 / area; + float beta = beta_tri * tri_inv; + float gamma = gamma_tri * tri_inv; + float alpha = 1.0 - beta - gamma; + barycentric[0] = alpha; + barycentric[1] = beta; + barycentric[2] = gamma; +} + +__host__ __device__ inline bool isBarycentricCoordInBounds(float* barycentricCoord) { + return barycentricCoord[0] >= 0.0 && barycentricCoord[0] <= 1.0 && + barycentricCoord[1] >= 0.0 && barycentricCoord[1] <= 1.0 && + barycentricCoord[2] >= 0.0 && barycentricCoord[2] <= 1.0; +} + +std::vector rasterize_image_gpu(torch::Tensor V, torch::Tensor F, torch::Tensor D, + int width, int height, float occlusion_truncation, int use_depth_prior); + +std::vector> build_hierarchy(std::vector view_layer_positions, std::vector view_layer_normals, int num_level, int resolution); + +std::vector> build_hierarchy_with_feat( + std::vector view_layer_positions, + std::vector view_layer_normals, + std::vector view_layer_feats, + int num_level, int resolution); + +#endif \ No newline at end of file diff --git 
a/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer_gpu.cu b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer_gpu.cu new file mode 100644 index 0000000..cc6f354 --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/lib/custom_rasterizer_kernel/rasterizer_gpu.cu @@ -0,0 +1,127 @@ +#include "rasterizer.h" + +__device__ void rasterizeTriangleGPU(int idx, float* vt0, float* vt1, float* vt2, int width, int height, INT64* zbuffer, float* d, float occlusion_truncation) { + float x_min = std::min(vt0[0], std::min(vt1[0],vt2[0])); + float x_max = std::max(vt0[0], std::max(vt1[0],vt2[0])); + float y_min = std::min(vt0[1], std::min(vt1[1],vt2[1])); + float y_max = std::max(vt0[1], std::max(vt1[1],vt2[1])); + + for (int px = x_min; px < x_max + 1; ++px) { + if (px < 0 || px >= width) + continue; + for (int py = y_min; py < y_max + 1; ++py) { + if (py < 0 || py >= height) + continue; + float vt[2] = {px + 0.5f, py + 0.5f}; + float baryCentricCoordinate[3]; + calculateBarycentricCoordinate(vt0, vt1, vt2, vt, baryCentricCoordinate); + if (isBarycentricCoordInBounds(baryCentricCoordinate)) { + int pixel = py * width + px; + if (zbuffer == 0) { + atomicExch(&zbuffer[pixel], (INT64)(idx + 1)); + continue; + } + float depth = baryCentricCoordinate[0] * vt0[2] + baryCentricCoordinate[1] * vt1[2] + baryCentricCoordinate[2] * vt2[2]; + float depth_thres = 0; + if (d) { + depth_thres = d[pixel] * 0.49999f + 0.5f + occlusion_truncation; + } + + int z_quantize = depth * (2<<17); + INT64 token = (INT64)z_quantize * MAXINT + (INT64)(idx + 1); + if (depth < depth_thres) + continue; + atomicMin(&zbuffer[pixel], token); + } + } + } +} + +__global__ void barycentricFromImgcoordGPU(float* V, int* F, int* findices, INT64* zbuffer, int width, int height, int num_vertices, int num_faces, + float* barycentric_map) +{ + int pix = blockIdx.x * blockDim.x + threadIdx.x; + if (pix >= width * height) + return; + INT64 f = zbuffer[pix] % MAXINT; + if (f == (MAXINT-1)) { + findices[pix] = 0; + barycentric_map[pix * 3] = 0; + barycentric_map[pix * 3 + 1] = 0; + barycentric_map[pix * 3 + 2] = 0; + return; + } + findices[pix] = f; + f -= 1; + float barycentric[3] = {0, 0, 0}; + if (f >= 0) { + float vt[2] = {float(pix % width) + 0.5f, float(pix / width) + 0.5f}; + float* vt0_ptr = V + (F[f * 3] * 4); + float* vt1_ptr = V + (F[f * 3 + 1] * 4); + float* vt2_ptr = V + (F[f * 3 + 2] * 4); + + float vt0[2] = {(vt0_ptr[0] / vt0_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt0_ptr[1] / vt0_ptr[3]) * (height - 1) + 0.5f}; + float vt1[2] = {(vt1_ptr[0] / vt1_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt1_ptr[1] / vt1_ptr[3]) * (height - 1) + 0.5f}; + float vt2[2] = {(vt2_ptr[0] / vt2_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt2_ptr[1] / vt2_ptr[3]) * (height - 1) + 0.5f}; + + calculateBarycentricCoordinate(vt0, vt1, vt2, vt, barycentric); + + barycentric[0] = barycentric[0] / vt0_ptr[3]; + barycentric[1] = barycentric[1] / vt1_ptr[3]; + barycentric[2] = barycentric[2] / vt2_ptr[3]; + float w = 1.0f / (barycentric[0] + barycentric[1] + barycentric[2]); + barycentric[0] *= w; + barycentric[1] *= w; + barycentric[2] *= w; + + } + barycentric_map[pix * 3] = barycentric[0]; + barycentric_map[pix * 3 + 1] = barycentric[1]; + barycentric_map[pix * 3 + 2] = barycentric[2]; +} + +__global__ void rasterizeImagecoordsKernelGPU(float* V, int* F, float* d, INT64* zbuffer, float occlusion_trunc, int width, int height, int num_vertices, int num_faces) +{ + int f = 
blockIdx.x * blockDim.x + threadIdx.x; + if (f >= num_faces) + return; + + float* vt0_ptr = V + (F[f * 3] * 4); + float* vt1_ptr = V + (F[f * 3 + 1] * 4); + float* vt2_ptr = V + (F[f * 3 + 2] * 4); + + float vt0[3] = {(vt0_ptr[0] / vt0_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt0_ptr[1] / vt0_ptr[3]) * (height - 1) + 0.5f, vt0_ptr[2] / vt0_ptr[3] * 0.49999f + 0.5f}; + float vt1[3] = {(vt1_ptr[0] / vt1_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt1_ptr[1] / vt1_ptr[3]) * (height - 1) + 0.5f, vt1_ptr[2] / vt1_ptr[3] * 0.49999f + 0.5f}; + float vt2[3] = {(vt2_ptr[0] / vt2_ptr[3] * 0.5f + 0.5f) * (width - 1) + 0.5f, (0.5f + 0.5f * vt2_ptr[1] / vt2_ptr[3]) * (height - 1) + 0.5f, vt2_ptr[2] / vt2_ptr[3] * 0.49999f + 0.5f}; + + rasterizeTriangleGPU(f, vt0, vt1, vt2, width, height, zbuffer, d, occlusion_trunc); +} + +std::vector rasterize_image_gpu(torch::Tensor V, torch::Tensor F, torch::Tensor D, + int width, int height, float occlusion_truncation, int use_depth_prior) +{ + int device_id = V.get_device(); + cudaSetDevice(device_id); + int num_faces = F.size(0); + int num_vertices = V.size(0); + auto options = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA, device_id).requires_grad(false); + auto INT64_options = torch::TensorOptions().dtype(torch::kInt64).device(torch::kCUDA, device_id).requires_grad(false); + auto findices = torch::zeros({height, width}, options); + INT64 maxint = (INT64)MAXINT * (INT64)MAXINT + (MAXINT - 1); + auto z_min = torch::ones({height, width}, INT64_options) * (int64_t)maxint; + + if (!use_depth_prior) { + rasterizeImagecoordsKernelGPU<<<(num_faces+255)/256,256,0,at::cuda::getCurrentCUDAStream()>>>(V.data_ptr(), F.data_ptr(), 0, + (INT64*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces); + } else { + rasterizeImagecoordsKernelGPU<<<(num_faces+255)/256,256,0,at::cuda::getCurrentCUDAStream()>>>(V.data_ptr(), F.data_ptr(), D.data_ptr(), + (INT64*)z_min.data_ptr(), occlusion_truncation, width, height, num_vertices, num_faces); + } + + auto float_options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA, device_id).requires_grad(false); + auto barycentric = torch::zeros({height, width, 3}, float_options); + barycentricFromImgcoordGPU<<<(width * height + 255)/256, 256>>>(V.data_ptr(), F.data_ptr(), + findices.data_ptr(), (INT64*)z_min.data_ptr(), width, height, num_vertices, num_faces, barycentric.data_ptr()); + + return {findices, barycentric}; +} diff --git a/hy3dgen/texgen/custom_rasterizer/setup.py b/hy3dgen/texgen/custom_rasterizer/setup.py new file mode 100644 index 0000000..3e312a7 --- /dev/null +++ b/hy3dgen/texgen/custom_rasterizer/setup.py @@ -0,0 +1,26 @@ +from setuptools import setup, find_packages +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +# build custom rasterizer +# build with `python setup.py install` +# nvcc is needed + +custom_rasterizer_module = CUDAExtension('custom_rasterizer_kernel', [ + 'lib/custom_rasterizer_kernel/rasterizer.cpp', + 'lib/custom_rasterizer_kernel/grid_neighbor.cpp', + 'lib/custom_rasterizer_kernel/rasterizer_gpu.cu', +]) + +setup( + packages=find_packages(), + version='0.1', + name='custom_rasterizer', + include_package_data=True, + package_dir={'': '.'}, + ext_modules=[ + custom_rasterizer_module, + ], + cmdclass={ + 'build_ext': BuildExtension + } +) diff --git a/hy3dgen/texgen/differentiable_renderer/__init__.py b/hy3dgen/texgen/differentiable_renderer/__init__.py new file mode 100644 index 
0000000..8bb2bf8 --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/__init__.py @@ -0,0 +1,13 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. \ No newline at end of file diff --git a/hy3dgen/texgen/differentiable_renderer/camera_utils.py b/hy3dgen/texgen/differentiable_renderer/camera_utils.py new file mode 100644 index 0000000..b67727c --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/camera_utils.py @@ -0,0 +1,106 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import math + +import numpy as np +import torch + + +def transform_pos(mtx, pos, keepdim=False): + t_mtx = torch.from_numpy(mtx).to( + pos.device) if isinstance( + mtx, np.ndarray) else mtx + if pos.shape[-1] == 3: + posw = torch.cat( + [pos, torch.ones([pos.shape[0], 1]).to(pos.device)], axis=1) + else: + posw = pos + + if keepdim: + return torch.matmul(posw, t_mtx.t())[...] + else: + return torch.matmul(posw, t_mtx.t())[None, ...] 
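transform_pos above homogenizes an [N, 3] position tensor by appending w = 1, applies a 4x4 matrix supplied either as a NumPy array or a torch tensor, and returns a batched [1, N, 4] result unless keepdim=True. A small identity-matrix check, assuming the repository root is on PYTHONPATH:

import numpy as np
import torch
from hy3dgen.texgen.differentiable_renderer.camera_utils import transform_pos

pos = torch.tensor([[0.0, 0.0, 0.0],
                    [1.0, 2.0, 3.0]])
mvp = np.eye(4, dtype=np.float32)     # stand-in for a real model-view-projection

clip = transform_pos(mvp, pos)
print(clip.shape)   # torch.Size([1, 2, 4])
print(clip[0, 1])   # tensor([1., 2., 3., 1.]) - w = 1 appended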
+ + +def get_mv_matrix(elev, azim, camera_distance, center=None): + elev = -elev + azim += 90 + + elev_rad = math.radians(elev) + azim_rad = math.radians(azim) + + camera_position = np.array([camera_distance * math.cos(elev_rad) * math.cos(azim_rad), + camera_distance * + math.cos(elev_rad) * math.sin(azim_rad), + camera_distance * math.sin(elev_rad)]) + + if center is None: + center = np.array([0, 0, 0]) + else: + center = np.array(center) + + lookat = center - camera_position + lookat = lookat / np.linalg.norm(lookat) + + up = np.array([0, 0, 1.0]) + right = np.cross(lookat, up) + right = right / np.linalg.norm(right) + up = np.cross(right, lookat) + up = up / np.linalg.norm(up) + + c2w = np.concatenate( + [np.stack([right, up, -lookat], axis=-1), camera_position[:, None]], axis=-1) + + w2c = np.zeros((4, 4)) + w2c[:3, :3] = np.transpose(c2w[:3, :3], (1, 0)) + w2c[:3, 3:] = -np.matmul(np.transpose(c2w[:3, :3], (1, 0)), c2w[:3, 3:]) + w2c[3, 3] = 1.0 + + return w2c.astype(np.float32) + + +def get_orthographic_projection_matrix( + left=-1, right=1, bottom=-1, top=1, near=0, far=2): + """ + 计算正交投影矩阵。 + + 参数: + left (float): 投影区域左侧边界。 + right (float): 投影区域右侧边界。 + bottom (float): 投影区域底部边界。 + top (float): 投影区域顶部边界。 + near (float): 投影区域近裁剪面距离。 + far (float): 投影区域远裁剪面距离。 + + 返回: + numpy.ndarray: 正交投影矩阵。 + """ + ortho_matrix = np.eye(4, dtype=np.float32) + ortho_matrix[0, 0] = 2 / (right - left) + ortho_matrix[1, 1] = 2 / (top - bottom) + ortho_matrix[2, 2] = -2 / (far - near) + ortho_matrix[0, 3] = -(right + left) / (right - left) + ortho_matrix[1, 3] = -(top + bottom) / (top - bottom) + ortho_matrix[2, 3] = -(far + near) / (far - near) + return ortho_matrix + + +def get_perspective_projection_matrix(fovy, aspect_wh, near, far): + fovy_rad = math.radians(fovy) + return np.array([[1.0 / (math.tan(fovy_rad / 2.0) * aspect_wh), 0, 0, 0], + [0, 1.0 / math.tan(fovy_rad / 2.0), 0, 0], + [0, 0, -(far + near) / (far - near), - + 2.0 * far * near / (far - near)], + [0, 0, -1, 0]]).astype(np.float32) diff --git a/hy3dgen/texgen/differentiable_renderer/compile_mesh_painter.bat b/hy3dgen/texgen/differentiable_renderer/compile_mesh_painter.bat new file mode 100644 index 0000000..3947b0f --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/compile_mesh_painter.bat @@ -0,0 +1,3 @@ +FOR /F "tokens=*" %%i IN ('python -m pybind11 --includes') DO SET PYINCLUDES=%%i +echo %PYINCLUDES% +g++ -O3 -Wall -shared -std=c++11 -fPIC %PYINCLUDES% mesh_processor.cpp -o mesh_processor.pyd -lpython3.12 \ No newline at end of file diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_processor.cpp b/hy3dgen/texgen/differentiable_renderer/mesh_processor.cpp new file mode 100644 index 0000000..ca8650f --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/mesh_processor.cpp @@ -0,0 +1,161 @@ +#include +#include +#include +#include +#include +#include +#include + +namespace py = pybind11; +using namespace std; + +std::pair, + py::array_t> meshVerticeInpaint_smooth(py::array_t texture, +py::array_t mask, + py::array_t vtx_pos, py::array_t vtx_uv, + py::array_t pos_idx, py::array_t uv_idx) { + auto texture_buf = texture.request(); + auto mask_buf = mask.request(); + auto vtx_pos_buf = vtx_pos.request(); + auto vtx_uv_buf = vtx_uv.request(); + auto pos_idx_buf = pos_idx.request(); + auto uv_idx_buf = uv_idx.request(); + + int texture_height = texture_buf.shape[0]; + int texture_width = texture_buf.shape[1]; + int texture_channel = texture_buf.shape[2]; + float* texture_ptr = static_cast(texture_buf.ptr); + uint8_t* 
mask_ptr = static_cast(mask_buf.ptr); + + int vtx_num = vtx_pos_buf.shape[0]; + float* vtx_pos_ptr = static_cast(vtx_pos_buf.ptr); + float* vtx_uv_ptr = static_cast(vtx_uv_buf.ptr); + int* pos_idx_ptr = static_cast(pos_idx_buf.ptr); + int* uv_idx_ptr = static_cast(uv_idx_buf.ptr); + + vector vtx_mask(vtx_num, 0.0f); + vector> vtx_color(vtx_num, vector(texture_channel, 0.0f)); + vector uncolored_vtxs; + + vector> G(vtx_num); + + for (int i = 0; i < uv_idx_buf.shape[0]; ++i) { + for (int k = 0; k < 3; ++k) { + int vtx_uv_idx = uv_idx_ptr[i * 3 + k]; + int vtx_idx = pos_idx_ptr[i * 3 + k]; + int uv_v = round(vtx_uv_ptr[vtx_uv_idx * 2] * (texture_width - 1)); + int uv_u = round((1.0 - vtx_uv_ptr[vtx_uv_idx * 2 + 1]) * (texture_height - 1)); + + if (mask_ptr[uv_u * texture_width + uv_v] > 0) { + vtx_mask[vtx_idx] = 1.0f; + for (int c = 0; c < texture_channel; ++c) { + vtx_color[vtx_idx][c] = texture_ptr[(uv_u * texture_width + uv_v) * texture_channel + c]; + } + }else{ + uncolored_vtxs.push_back(vtx_idx); + } + + G[pos_idx_ptr[i * 3 + k]].push_back(pos_idx_ptr[i * 3 + (k + 1) % 3]); + } + } + + int smooth_count = 2; + int last_uncolored_vtx_count = 0; + while (smooth_count>0) { + int uncolored_vtx_count = 0; + + for (int vtx_idx : uncolored_vtxs) { + + vector sum_color(texture_channel, 0.0f); + float total_weight = 0.0f; + + array vtx_0 = {vtx_pos_ptr[vtx_idx * 3], +vtx_pos_ptr[vtx_idx * 3 + 1], vtx_pos_ptr[vtx_idx * 3 + 2]}; + for (int connected_idx : G[vtx_idx]) { + if (vtx_mask[connected_idx] > 0) { + array vtx1 = {vtx_pos_ptr[connected_idx * 3], + vtx_pos_ptr[connected_idx * 3 + 1], vtx_pos_ptr[connected_idx * 3 + 2]}; + float dist_weight = 1.0f / max(sqrt(pow(vtx_0[0] - vtx1[0], 2) + pow(vtx_0[1] - vtx1[1], 2) + \ + pow(vtx_0[2] - vtx1[2], 2)), 1E-4); + dist_weight = dist_weight * dist_weight; + for (int c = 0; c < texture_channel; ++c) { + sum_color[c] += vtx_color[connected_idx][c] * dist_weight; + } + total_weight += dist_weight; + } + } + + if (total_weight > 0.0f) { + for (int c = 0; c < texture_channel; ++c) { + vtx_color[vtx_idx][c] = sum_color[c] / total_weight; + } + vtx_mask[vtx_idx] = 1.0f; + } else { + uncolored_vtx_count++; + } + + } + + if(last_uncolored_vtx_count==uncolored_vtx_count){ + smooth_count--; + }else{ + smooth_count++; + } + last_uncolored_vtx_count = uncolored_vtx_count; + } + + // Create new arrays for the output + py::array_t new_texture(texture_buf.size); + py::array_t new_mask(mask_buf.size); + + auto new_texture_buf = new_texture.request(); + auto new_mask_buf = new_mask.request(); + + float* new_texture_ptr = static_cast(new_texture_buf.ptr); + uint8_t* new_mask_ptr = static_cast(new_mask_buf.ptr); + // Copy original texture and mask to new arrays + std::copy(texture_ptr, texture_ptr + texture_buf.size, new_texture_ptr); + std::copy(mask_ptr, mask_ptr + mask_buf.size, new_mask_ptr); + + for (int face_idx = 0; face_idx < uv_idx_buf.shape[0]; ++face_idx) { + for (int k = 0; k < 3; ++k) { + int vtx_uv_idx = uv_idx_ptr[face_idx * 3 + k]; + int vtx_idx = pos_idx_ptr[face_idx * 3 + k]; + + if (vtx_mask[vtx_idx] == 1.0f) { + int uv_v = round(vtx_uv_ptr[vtx_uv_idx * 2] * (texture_width - 1)); + int uv_u = round((1.0 - vtx_uv_ptr[vtx_uv_idx * 2 + 1]) * (texture_height - 1)); + + for (int c = 0; c < texture_channel; ++c) { + new_texture_ptr[(uv_u * texture_width + uv_v) * texture_channel + c] = vtx_color[vtx_idx][c]; + } + new_mask_ptr[uv_u * texture_width + uv_v] = 255; + } + } + } + + // Reshape the new arrays to match the original texture and mask shapes + 
new_texture.resize({texture_height, texture_width, 3}); + new_mask.resize({texture_height, texture_width}); + return std::make_pair(new_texture, new_mask); +} + + +std::pair, py::array_t> meshVerticeInpaint(py::array_t texture, + py::array_t mask, + py::array_t vtx_pos, py::array_t vtx_uv, + py::array_t pos_idx, py::array_t uv_idx, const std::string& method = "smooth") { + if (method == "smooth") { + return meshVerticeInpaint_smooth(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx); + } else { + throw std::invalid_argument("Invalid method. Use 'smooth' or 'forward'."); + } +} + +PYBIND11_MODULE(mesh_processor, m) { + m.def("meshVerticeInpaint", &meshVerticeInpaint, "A function to process mesh", + py::arg("texture"), py::arg("mask"), + py::arg("vtx_pos"), py::arg("vtx_uv"), + py::arg("pos_idx"), py::arg("uv_idx"), + py::arg("method") = "smooth"); +} \ No newline at end of file diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_processor.py b/hy3dgen/texgen/differentiable_renderer/mesh_processor.py new file mode 100644 index 0000000..5a731cc --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/mesh_processor.py @@ -0,0 +1,84 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
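The compiled mesh_processor module above and the pure-Python fallback that follows expose the same meshVerticeInpaint signature. Below is a tiny end-to-end check written against the Python fallback (assuming the repository root is importable); the texture, mask, and triangle are invented test data. The triangle's third vertex maps to an unmasked texel, so it inherits a distance-weighted color from its colored graph neighbor, and that texel is written back into the returned texture and mask.

import numpy as np
from hy3dgen.texgen.differentiable_renderer.mesh_processor import meshVerticeInpaint

texture = np.zeros((4, 4, 3), dtype=np.float32)
mask = np.zeros((4, 4), dtype=np.uint8)
texture[0, 0] = [1.0, 0.0, 0.0]    # texel of vertex 0 (already colored red)
texture[0, 3] = [0.0, 0.0, 1.0]    # texel of vertex 1 (already colored blue)
mask[0, 0] = mask[0, 3] = 255      # vertex 2's texel at (3, 0) stays unmasked

vtx_pos = np.array([[0, 0, 0], [1, 0, 0], [0, 1, 0]], dtype=np.float32)
vtx_uv = np.array([[0, 1], [1, 1], [0, 0]], dtype=np.float32)
pos_idx = np.array([[0, 1, 2]], dtype=np.int32)
uv_idx = np.array([[0, 1, 2]], dtype=np.int32)

new_texture, new_mask = meshVerticeInpaint(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx)
print(new_texture[3, 0], new_mask[3, 0])  # [1. 0. 0.] 255 - filled from vertex 0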
+ +import numpy as np + +def meshVerticeInpaint_smooth(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx): + texture_height, texture_width, texture_channel = texture.shape + vtx_num = vtx_pos.shape[0] + + vtx_mask = np.zeros(vtx_num, dtype=np.float32) + vtx_color = [np.zeros(texture_channel, dtype=np.float32) for _ in range(vtx_num)] + uncolored_vtxs = [] + G = [[] for _ in range(vtx_num)] + + for i in range(uv_idx.shape[0]): + for k in range(3): + vtx_uv_idx = uv_idx[i, k] + vtx_idx = pos_idx[i, k] + uv_v = int(round(vtx_uv[vtx_uv_idx, 0] * (texture_width - 1))) + uv_u = int(round((1.0 - vtx_uv[vtx_uv_idx, 1]) * (texture_height - 1))) + if mask[uv_u, uv_v] > 0: + vtx_mask[vtx_idx] = 1.0 + vtx_color[vtx_idx] = texture[uv_u, uv_v] + else: + uncolored_vtxs.append(vtx_idx) + G[pos_idx[i, k]].append(pos_idx[i, (k + 1) % 3]) + + smooth_count = 2 + last_uncolored_vtx_count = 0 + while smooth_count > 0: + uncolored_vtx_count = 0 + for vtx_idx in uncolored_vtxs: + sum_color = np.zeros(texture_channel, dtype=np.float32) + total_weight = 0.0 + vtx_0 = vtx_pos[vtx_idx] + for connected_idx in G[vtx_idx]: + if vtx_mask[connected_idx] > 0: + vtx1 = vtx_pos[connected_idx] + dist = np.sqrt(np.sum((vtx_0 - vtx1) ** 2)) + dist_weight = 1.0 / max(dist, 1e-4) + dist_weight *= dist_weight + sum_color += vtx_color[connected_idx] * dist_weight + total_weight += dist_weight + if total_weight > 0: + vtx_color[vtx_idx] = sum_color / total_weight + vtx_mask[vtx_idx] = 1.0 + else: + uncolored_vtx_count += 1 + + if last_uncolored_vtx_count == uncolored_vtx_count: + smooth_count -= 1 + else: + smooth_count += 1 + last_uncolored_vtx_count = uncolored_vtx_count + + new_texture = texture.copy() + new_mask = mask.copy() + for face_idx in range(uv_idx.shape[0]): + for k in range(3): + vtx_uv_idx = uv_idx[face_idx, k] + vtx_idx = pos_idx[face_idx, k] + if vtx_mask[vtx_idx] == 1.0: + uv_v = int(round(vtx_uv[vtx_uv_idx, 0] * (texture_width - 1))) + uv_u = int(round((1.0 - vtx_uv[vtx_uv_idx, 1]) * (texture_height - 1))) + new_texture[uv_u, uv_v] = vtx_color[vtx_idx] + new_mask[uv_u, uv_v] = 255 + return new_texture, new_mask + +def meshVerticeInpaint(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx, method="smooth"): + if method == "smooth": + return meshVerticeInpaint_smooth(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx) + else: + raise ValueError("Invalid method. Use 'smooth' or 'forward'.") \ No newline at end of file diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_render.py b/hy3dgen/texgen/differentiable_renderer/mesh_render.py new file mode 100644 index 0000000..6f83a36 --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/mesh_render.py @@ -0,0 +1,823 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. 
+ +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import cv2 +import numpy as np +import torch +import torch.nn.functional as F +import trimesh +from PIL import Image + +from .camera_utils import ( + transform_pos, + get_mv_matrix, + get_orthographic_projection_matrix, + get_perspective_projection_matrix, +) +from .mesh_processor import meshVerticeInpaint +from .mesh_utils import load_mesh, save_mesh + + +def stride_from_shape(shape): + stride = [1] + for x in reversed(shape[1:]): + stride.append(stride[-1] * x) + return list(reversed(stride)) + + +def scatter_add_nd_with_count(input, count, indices, values, weights=None): + # input: [..., C], D dimension + C channel + # count: [..., 1], D dimension + # indices: [N, D], long + # values: [N, C] + + D = indices.shape[-1] + C = input.shape[-1] + size = input.shape[:-1] + stride = stride_from_shape(size) + + assert len(size) == D + + input = input.view(-1, C) # [HW, C] + count = count.view(-1, 1) + + flatten_indices = (indices * torch.tensor(stride, + dtype=torch.long, device=indices.device)).sum(-1) # [N] + + if weights is None: + weights = torch.ones_like(values[..., :1]) + + input.scatter_add_(0, flatten_indices.unsqueeze(1).repeat(1, C), values) + count.scatter_add_(0, flatten_indices.unsqueeze(1), weights) + + return input.view(*size, C), count.view(*size, 1) + + +def linear_grid_put_2d(H, W, coords, values, return_count=False): + # coords: [N, 2], float in [0, 1] + # values: [N, C] + + C = values.shape[-1] + + indices = coords * torch.tensor( + [H - 1, W - 1], dtype=torch.float32, device=coords.device + ) + indices_00 = indices.floor().long() # [N, 2] + indices_00[:, 0].clamp_(0, H - 2) + indices_00[:, 1].clamp_(0, W - 2) + indices_01 = indices_00 + torch.tensor( + [0, 1], dtype=torch.long, device=indices.device + ) + indices_10 = indices_00 + torch.tensor( + [1, 0], dtype=torch.long, device=indices.device + ) + indices_11 = indices_00 + torch.tensor( + [1, 1], dtype=torch.long, device=indices.device + ) + + h = indices[..., 0] - indices_00[..., 0].float() + w = indices[..., 1] - indices_00[..., 1].float() + w_00 = (1 - h) * (1 - w) + w_01 = (1 - h) * w + w_10 = h * (1 - w) + w_11 = h * w + + result = torch.zeros(H, W, C, device=values.device, + dtype=values.dtype) # [H, W, C] + count = torch.zeros(H, W, 1, device=values.device, + dtype=values.dtype) # [H, W, 1] + weights = torch.ones_like(values[..., :1]) # [N, 1] + + result, count = scatter_add_nd_with_count( + result, count, indices_00, values * w_00.unsqueeze(1), weights * w_00.unsqueeze(1)) + result, count = scatter_add_nd_with_count( + result, count, indices_01, values * w_01.unsqueeze(1), weights * w_01.unsqueeze(1)) + result, count = scatter_add_nd_with_count( + result, count, indices_10, values * w_10.unsqueeze(1), weights * w_10.unsqueeze(1)) + result, count = scatter_add_nd_with_count( + result, count, indices_11, values * w_11.unsqueeze(1), weights * w_11.unsqueeze(1)) + + if return_count: + return result, count + + mask = (count.squeeze(-1) > 0) + result[mask] = result[mask] / count[mask].repeat(1, C) + + return result + + +class MeshRender(): + def __init__( + self, + camera_distance=1.45, 
camera_type='orth', + default_resolution=1024, texture_size=1024, + use_antialias=True, max_mip_level=None, filter_mode='linear', + bake_mode='linear', raster_mode='cr', device='cuda'): + + self.device = device + + self.set_default_render_resolution(default_resolution) + self.set_default_texture_resolution(texture_size) + + self.camera_distance = camera_distance + self.use_antialias = use_antialias + self.max_mip_level = max_mip_level + self.filter_mode = filter_mode + + self.bake_angle_thres = 75 + self.bake_unreliable_kernel_size = int( + (2 / 512) * max(self.default_resolution[0], self.default_resolution[1])) + self.bake_mode = bake_mode + + self.raster_mode = raster_mode + if self.raster_mode == 'cr': + import custom_rasterizer as cr + self.raster = cr + else: + raise f'No raster named {self.raster_mode}' + + if camera_type == 'orth': + self.ortho_scale = 1.2 + self.camera_proj_mat = get_orthographic_projection_matrix( + left=-self.ortho_scale * 0.5, right=self.ortho_scale * 0.5, + bottom=-self.ortho_scale * 0.5, top=self.ortho_scale * 0.5, + near=0.1, far=100 + ) + elif camera_type == 'perspective': + self.camera_proj_mat = get_perspective_projection_matrix( + 49.13, self.default_resolution[1] / self.default_resolution[0], + 0.01, 100.0 + ) + else: + raise f'No camera type {camera_type}' + + def raster_rasterize(self, pos, tri, resolution, ranges=None, grad_db=True): + + if self.raster_mode == 'cr': + rast_out_db = None + if pos.dim() == 2: + pos = pos.unsqueeze(0) + findices, barycentric = self.raster.rasterize(pos, tri, resolution) + rast_out = torch.cat((barycentric, findices.unsqueeze(-1)), dim=-1) + rast_out = rast_out.unsqueeze(0) + else: + raise f'No raster named {self.raster_mode}' + + return rast_out, rast_out_db + + def raster_interpolate(self, uv, rast_out, uv_idx, rast_db=None, diff_attrs=None): + + if self.raster_mode == 'cr': + textd = None + barycentric = rast_out[0, ..., :-1] + findices = rast_out[0, ..., -1] + if uv.dim() == 2: + uv = uv.unsqueeze(0) + textc = self.raster.interpolate(uv, findices, barycentric, uv_idx) + else: + raise f'No raster named {self.raster_mode}' + + return textc, textd + + def raster_texture(self, tex, uv, uv_da=None, mip_level_bias=None, mip=None, filter_mode='auto', + boundary_mode='wrap', max_mip_level=None): + + if self.raster_mode == 'cr': + raise f'Texture is not implemented in cr' + else: + raise f'No raster named {self.raster_mode}' + + return color + + def raster_antialias(self, color, rast, pos, tri, topology_hash=None, pos_gradient_boost=1.0): + + if self.raster_mode == 'cr': + # Antialias has not been supported yet + color = color + else: + raise f'No raster named {self.raster_mode}' + + return color + + def load_mesh( + self, + mesh, + scale_factor=1.15, + auto_center=True, + ): + vtx_pos, pos_idx, vtx_uv, uv_idx, texture_data = load_mesh(mesh) + self.mesh_copy = mesh + self.set_mesh(vtx_pos, pos_idx, + vtx_uv=vtx_uv, uv_idx=uv_idx, + scale_factor=scale_factor, auto_center=auto_center + ) + if texture_data is not None: + self.set_texture(texture_data) + + def save_mesh(self): + texture_data = self.get_texture() + texture_data = Image.fromarray((texture_data * 255).astype(np.uint8)) + return save_mesh(self.mesh_copy, texture_data) + + def set_mesh( + self, + vtx_pos, pos_idx, + vtx_uv=None, uv_idx=None, + scale_factor=1.15, auto_center=True + ): + + self.vtx_pos = torch.from_numpy(vtx_pos).to(self.device).float() + self.pos_idx = torch.from_numpy(pos_idx).to(self.device).to(torch.int) + if (vtx_uv is not None) and (uv_idx is not 
None): + self.vtx_uv = torch.from_numpy(vtx_uv).to(self.device).float() + self.uv_idx = torch.from_numpy(uv_idx).to(self.device).to(torch.int) + else: + self.vtx_uv = None + self.uv_idx = None + + self.vtx_pos[:, [0, 1]] = -self.vtx_pos[:, [0, 1]] + self.vtx_pos[:, [1, 2]] = self.vtx_pos[:, [2, 1]] + if (vtx_uv is not None) and (uv_idx is not None): + self.vtx_uv[:, 1] = 1.0 - self.vtx_uv[:, 1] + + if auto_center: + max_bb = (self.vtx_pos - 0).max(0)[0] + min_bb = (self.vtx_pos - 0).min(0)[0] + center = (max_bb + min_bb) / 2 + scale = torch.norm(self.vtx_pos - center, dim=1).max() * 2.0 + self.vtx_pos = (self.vtx_pos - center) * \ + (scale_factor / float(scale)) + self.scale_factor = scale_factor + + def set_texture(self, tex): + if isinstance(tex, np.ndarray): + tex = Image.fromarray((tex * 255).astype(np.uint8)) + elif isinstance(tex, torch.Tensor): + tex = tex.cpu().numpy() + tex = Image.fromarray((tex * 255).astype(np.uint8)) + + tex = tex.resize(self.texture_size).convert('RGB') + tex = np.array(tex) / 255.0 + self.tex = torch.from_numpy(tex).to(self.device) + self.tex = self.tex.float() + + def set_default_render_resolution(self, default_resolution): + if isinstance(default_resolution, int): + default_resolution = (default_resolution, default_resolution) + self.default_resolution = default_resolution + + def set_default_texture_resolution(self, texture_size): + if isinstance(texture_size, int): + texture_size = (texture_size, texture_size) + self.texture_size = texture_size + + def get_mesh(self): + vtx_pos = self.vtx_pos.cpu().numpy() + pos_idx = self.pos_idx.cpu().numpy() + vtx_uv = self.vtx_uv.cpu().numpy() + uv_idx = self.uv_idx.cpu().numpy() + + # 坐标变换的逆变换 + vtx_pos[:, [1, 2]] = vtx_pos[:, [2, 1]] + vtx_pos[:, [0, 1]] = -vtx_pos[:, [0, 1]] + + vtx_uv[:, 1] = 1.0 - vtx_uv[:, 1] + return vtx_pos, pos_idx, vtx_uv, uv_idx + + def get_texture(self): + return self.tex.cpu().numpy() + + def to(self, device): + self.device = device + + for attr_name in dir(self): + attr_value = getattr(self, attr_name) + if isinstance(attr_value, torch.Tensor): + setattr(self, attr_name, attr_value.to(self.device)) + + def color_rgb_to_srgb(self, image): + if isinstance(image, Image.Image): + image_rgb = torch.tesnor( + np.array(image) / + 255.0).float().to( + self.device) + elif isinstance(image, np.ndarray): + image_rgb = torch.tensor(image).float() + else: + image_rgb = image.to(self.device) + + image_srgb = torch.where( + image_rgb <= 0.0031308, + 12.92 * image_rgb, + 1.055 * torch.pow(image_rgb, 1 / 2.4) - 0.055 + ) + + if isinstance(image, Image.Image): + image_srgb = Image.fromarray( + (image_srgb.cpu().numpy() * + 255).astype( + np.uint8)) + elif isinstance(image, np.ndarray): + image_srgb = image_srgb.cpu().numpy() + else: + image_srgb = image_srgb.to(image.device) + + return image_srgb + + def _render( + self, + glctx, + mvp, + pos, + pos_idx, + uv, + uv_idx, + tex, + resolution, + max_mip_level, + keep_alpha, + filter_mode + ): + pos_clip = transform_pos(mvp, pos) + if isinstance(resolution, (int, float)): + resolution = [resolution, resolution] + rast_out, rast_out_db = self.raster_rasterize( + glctx, pos_clip, pos_idx, resolution=resolution) + + tex = tex.contiguous() + if filter_mode == 'linear-mipmap-linear': + texc, texd = self.raster_interpolate( + uv[None, ...], rast_out, uv_idx, rast_db=rast_out_db, diff_attrs='all') + color = self.raster_texture( + tex[None, ...], texc, texd, filter_mode='linear-mipmap-linear', max_mip_level=max_mip_level) + else: + texc, _ = 
self.raster_interpolate(uv[None, ...], rast_out, uv_idx) + color = self.raster_texture(tex[None, ...], texc, filter_mode=filter_mode) + + visible_mask = torch.clamp(rast_out[..., -1:], 0, 1) + color = color * visible_mask # Mask out background. + if self.use_antialias: + color = self.raster_antialias(color, rast_out, pos_clip, pos_idx) + + if keep_alpha: + color = torch.cat([color, visible_mask], dim=-1) + return color[0, ...] + + def render( + self, + elev, + azim, + camera_distance=None, + center=None, + resolution=None, + tex=None, + keep_alpha=True, + bgcolor=None, + filter_mode=None, + return_type='th' + ): + + proj = self.camera_proj_mat + r_mv = get_mv_matrix( + elev=elev, + azim=azim, + camera_distance=self.camera_distance if camera_distance is None else camera_distance, + center=center) + r_mvp = np.matmul(proj, r_mv).astype(np.float32) + if tex is not None: + if isinstance(tex, Image.Image): + tex = torch.tensor(np.array(tex) / 255.0) + elif isinstance(tex, np.ndarray): + tex = torch.tensor(tex) + if tex.dim() == 2: + tex = tex.unsqueeze(-1) + tex = tex.float().to(self.device) + image = self._render(r_mvp, self.vtx_pos, self.pos_idx, self.vtx_uv, self.uv_idx, + self.tex if tex is None else tex, + self.default_resolution if resolution is None else resolution, + self.max_mip_level, True, filter_mode if filter_mode else self.filter_mode) + mask = (image[..., [-1]] == 1).float() + if bgcolor is None: + bgcolor = [0 for _ in range(image.shape[-1] - 1)] + image = image * mask + (1 - mask) * \ + torch.tensor(bgcolor + [0]).to(self.device) + if keep_alpha == False: + image = image[..., :-1] + if return_type == 'np': + image = image.cpu().numpy() + elif return_type == 'pl': + image = image.squeeze(-1).cpu().numpy() * 255 + image = Image.fromarray(image.astype(np.uint8)) + return image + + def render_normal( + self, + elev, + azim, + camera_distance=None, + center=None, + resolution=None, + bg_color=[1, 1, 1], + use_abs_coor=False, + normalize_rgb=True, + return_type='th' + ): + + pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center) + if resolution is None: + resolution = self.default_resolution + if isinstance(resolution, (int, float)): + resolution = [resolution, resolution] + rast_out, rast_out_db = self.raster_rasterize( + pos_clip, self.pos_idx, resolution=resolution) + + if use_abs_coor: + mesh_triangles = self.vtx_pos[self.pos_idx[:, :3], :] + else: + pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4] + mesh_triangles = pos_camera[self.pos_idx[:, :3], :] + face_normals = F.normalize( + torch.cross(mesh_triangles[:, + 1, + :] - mesh_triangles[:, + 0, + :], + mesh_triangles[:, + 2, + :] - mesh_triangles[:, + 0, + :], + dim=-1), + dim=-1) + + vertex_normals = trimesh.geometry.mean_vertex_normals(vertex_count=self.vtx_pos.shape[0], + faces=self.pos_idx.cpu(), + face_normals=face_normals.cpu(), ) + vertex_normals = torch.from_numpy( + vertex_normals).float().to(self.device).contiguous() + + # Interpolate normal values across the rasterized pixels + normal, _ = self.raster_interpolate( + vertex_normals[None, ...], rast_out, self.pos_idx) + + visible_mask = torch.clamp(rast_out[..., -1:], 0, 1) + normal = normal * visible_mask + \ + torch.tensor(bg_color, dtype=torch.float32, device=self.device) * (1 - + visible_mask) + + if normalize_rgb: + normal = (normal + 1) * 0.5 + if self.use_antialias: + normal = self.raster_antialias(normal, rast_out, pos_clip, self.pos_idx) + + image = normal[0, ...] 
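+        # Convert to the requested output: default 'th' keeps the torch tensor,
+        # 'np' returns a NumPy array, 'pl' an 8-bit PIL image.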
+ if return_type == 'np': + image = image.cpu().numpy() + elif return_type == 'pl': + image = image.cpu().numpy() * 255 + image = Image.fromarray(image.astype(np.uint8)) + + return image + + def convert_normal_map(self, image): + # blue is front, red is left, green is top + if isinstance(image, Image.Image): + image = np.array(image) + mask = (image == [255, 255, 255]).all(axis=-1) + + image = (image / 255.0) * 2.0 - 1.0 + + image[..., [1]] = -image[..., [1]] + image[..., [1, 2]] = image[..., [2, 1]] + image[..., [0]] = -image[..., [0]] + + image = (image + 1.0) * 0.5 + + image = (image * 255).astype(np.uint8) + image[mask] = [127, 127, 255] + + return Image.fromarray(image) + + def get_pos_from_mvp(self, elev, azim, camera_distance, center): + proj = self.camera_proj_mat + r_mv = get_mv_matrix( + elev=elev, + azim=azim, + camera_distance=self.camera_distance if camera_distance is None else camera_distance, + center=center) + + pos_camera = transform_pos(r_mv, self.vtx_pos, keepdim=True) + pos_clip = transform_pos(proj, pos_camera) + + return pos_camera, pos_clip + + def render_depth( + self, + elev, + azim, + camera_distance=None, + center=None, + resolution=None, + return_type='th' + ): + pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center) + + if resolution is None: + resolution = self.default_resolution + if isinstance(resolution, (int, float)): + resolution = [resolution, resolution] + rast_out, rast_out_db = self.raster_rasterize( + pos_clip, self.pos_idx, resolution=resolution) + + pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4] + tex_depth = pos_camera[:, 2].reshape(1, -1, 1).contiguous() + + # Interpolate depth values across the rasterized pixels + depth, _ = self.raster_interpolate(tex_depth, rast_out, self.pos_idx) + + visible_mask = torch.clamp(rast_out[..., -1:], 0, 1) + depth_max, depth_min = depth[visible_mask > + 0].max(), depth[visible_mask > 0].min() + depth = (depth - depth_min) / (depth_max - depth_min) + + depth = depth * visible_mask # Mask out background. + if self.use_antialias: + depth = self.raster_antialias(depth, rast_out, pos_clip, self.pos_idx) + + image = depth[0, ...] + if return_type == 'np': + image = image.cpu().numpy() + elif return_type == 'pl': + image = image.squeeze(-1).cpu().numpy() * 255 + image = Image.fromarray(image.astype(np.uint8)) + return image + + def render_position(self, elev, azim, camera_distance=None, center=None, + resolution=None, bg_color=[1, 1, 1], return_type='th'): + pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center) + if resolution is None: + resolution = self.default_resolution + if isinstance(resolution, (int, float)): + resolution = [resolution, resolution] + rast_out, rast_out_db = self.raster_rasterize( + pos_clip, self.pos_idx, resolution=resolution) + + tex_position = 0.5 - self.vtx_pos[:, :3] / self.scale_factor + tex_position = tex_position.contiguous() + + # Interpolate depth values across the rasterized pixels + position, _ = self.raster_interpolate( + tex_position[None, ...], rast_out, self.pos_idx) + + visible_mask = torch.clamp(rast_out[..., -1:], 0, 1) + + position = position * visible_mask + \ + torch.tensor(bg_color, dtype=torch.float32, device=self.device) * (1 - + visible_mask) + if self.use_antialias: + position = self.raster_antialias(position, rast_out, pos_clip, self.pos_idx) + + image = position[0, ...] 
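+        # image now holds per-pixel object-space coordinates remapped as 0.5 - xyz / scale_factor
+        # (roughly [0, 1] for a mesh normalized by set_mesh); background pixels take bg_color.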
+ + if return_type == 'np': + image = image.cpu().numpy() + elif return_type == 'pl': + image = image.squeeze(-1).cpu().numpy() * 255 + image = Image.fromarray(image.astype(np.uint8)) + return image + + def render_uvpos(self, return_type='th'): + image = self.uv_feature_map(self.vtx_pos * 0.5 + 0.5) + if return_type == 'np': + image = image.cpu().numpy() + elif return_type == 'pl': + image = image.cpu().numpy() * 255 + image = Image.fromarray(image.astype(np.uint8)) + return image + + def uv_feature_map(self, vert_feat, bg=None): + vtx_uv = self.vtx_uv * 2 - 1.0 + vtx_uv = torch.cat( + [vtx_uv, torch.zeros_like(self.vtx_uv)], dim=1).unsqueeze(0) + vtx_uv[..., -1] = 1 + uv_idx = self.uv_idx + rast_out, rast_out_db = self.raster_rasterize( + vtx_uv, uv_idx, resolution=self.texture_size) + feat_map, _ = self.raster_interpolate(vert_feat[None, ...], rast_out, uv_idx) + feat_map = feat_map[0, ...] + if bg is not None: + visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)[0, ...] + feat_map[visible_mask == 0] = bg + return feat_map + + def render_sketch_from_geometry(self, normal_image, depth_image): + normal_image_np = normal_image.cpu().numpy() + depth_image_np = depth_image.cpu().numpy() + + normal_image_np = (normal_image_np * 255).astype(np.uint8) + depth_image_np = (depth_image_np * 255).astype(np.uint8) + normal_image_np = cv2.cvtColor(normal_image_np, cv2.COLOR_RGB2GRAY) + + normal_edges = cv2.Canny(normal_image_np, 80, 150) + depth_edges = cv2.Canny(depth_image_np, 30, 80) + + combined_edges = np.maximum(normal_edges, depth_edges) + + sketch_image = torch.from_numpy(combined_edges).to( + normal_image.device).float() / 255.0 + sketch_image = sketch_image.unsqueeze(-1) + + return sketch_image + + def render_sketch_from_depth(self, depth_image): + depth_image_np = depth_image.cpu().numpy() + depth_image_np = (depth_image_np * 255).astype(np.uint8) + depth_edges = cv2.Canny(depth_image_np, 30, 80) + combined_edges = depth_edges + sketch_image = torch.from_numpy(combined_edges).to( + depth_image.device).float() / 255.0 + sketch_image = sketch_image.unsqueeze(-1) + return sketch_image + + def back_project(self, image, elev, azim, + camera_distance=None, center=None, method=None): + if isinstance(image, Image.Image): + image = torch.tensor(np.array(image) / 255.0) + elif isinstance(image, np.ndarray): + image = torch.tensor(image) + if image.dim() == 2: + image = image.unsqueeze(-1) + image = image.float().to(self.device) + resolution = image.shape[:2] + channel = image.shape[-1] + texture = torch.zeros(self.texture_size + (channel,)).to(self.device) + cos_map = torch.zeros(self.texture_size + (1,)).to(self.device) + + proj = self.camera_proj_mat + r_mv = get_mv_matrix( + elev=elev, + azim=azim, + camera_distance=self.camera_distance if camera_distance is None else camera_distance, + center=center) + pos_camera = transform_pos(r_mv, self.vtx_pos, keepdim=True) + pos_clip = transform_pos(proj, pos_camera) + pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4] + v0 = pos_camera[self.pos_idx[:, 0], :] + v1 = pos_camera[self.pos_idx[:, 1], :] + v2 = pos_camera[self.pos_idx[:, 2], :] + face_normals = F.normalize( + torch.cross( + v1 - v0, + v2 - v0, + dim=-1), + dim=-1) + vertex_normals = trimesh.geometry.mean_vertex_normals(vertex_count=self.vtx_pos.shape[0], + faces=self.pos_idx.cpu(), + face_normals=face_normals.cpu(), ) + vertex_normals = torch.from_numpy( + vertex_normals).float().to(self.device).contiguous() + tex_depth = pos_camera[:, 2].reshape(1, -1, 1).contiguous() + rast_out, rast_out_db 
= self.raster_rasterize( + pos_clip, self.pos_idx, resolution=resolution) + visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)[0, ...] + + normal, _ = self.raster_interpolate( + vertex_normals[None, ...], rast_out, self.pos_idx) + normal = normal[0, ...] + uv, _ = self.raster_interpolate(self.vtx_uv[None, ...], rast_out, self.uv_idx) + depth, _ = self.raster_interpolate(tex_depth, rast_out, self.pos_idx) + depth = depth[0, ...] + + depth_max, depth_min = depth[visible_mask > + 0].max(), depth[visible_mask > 0].min() + depth_normalized = (depth - depth_min) / (depth_max - depth_min) + depth_image = depth_normalized * visible_mask # Mask out background. + + sketch_image = self.render_sketch_from_depth(depth_image) + + lookat = torch.tensor([[0, 0, -1]], device=self.device) + cos_image = torch.nn.functional.cosine_similarity( + lookat, normal.view(-1, 3)) + cos_image = cos_image.view(normal.shape[0], normal.shape[1], 1) + + cos_thres = np.cos(self.bake_angle_thres / 180 * np.pi) + cos_image[cos_image < cos_thres] = 0 + + # shrink + kernel_size = self.bake_unreliable_kernel_size * 2 + 1 + kernel = torch.ones( + (1, 1, kernel_size, kernel_size), dtype=torch.float32).to( + sketch_image.device) + + visible_mask = visible_mask.permute(2, 0, 1).unsqueeze(0).float() + visible_mask = F.conv2d( + 1.0 - visible_mask, + kernel, + padding=kernel_size // 2) + visible_mask = 1.0 - (visible_mask > 0).float() # 二值化 + visible_mask = visible_mask.squeeze(0).permute(1, 2, 0) + + sketch_image = sketch_image.permute(2, 0, 1).unsqueeze(0) + sketch_image = F.conv2d(sketch_image, kernel, padding=kernel_size // 2) + sketch_image = (sketch_image > 0).float() # 二值化 + sketch_image = sketch_image.squeeze(0).permute(1, 2, 0) + visible_mask = visible_mask * (sketch_image < 0.5) + + cos_image[visible_mask == 0] = 0 + + method = self.bake_mode if method is None else method + + if method == 'linear': + proj_mask = (visible_mask != 0).view(-1) + uv = uv.squeeze(0).contiguous().view(-1, 2)[proj_mask] + image = image.squeeze(0).contiguous().view(-1, channel)[proj_mask] + cos_image = cos_image.contiguous().view(-1, 1)[proj_mask] + sketch_image = sketch_image.contiguous().view(-1, 1)[proj_mask] + + texture = linear_grid_put_2d( + self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], image) + cos_map = linear_grid_put_2d( + self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], cos_image) + boundary_map = linear_grid_put_2d( + self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], sketch_image) + else: + raise f'No bake mode {method}' + + return texture, cos_map, boundary_map + + def bake_texture(self, colors, elevs, azims, + camera_distance=None, center=None, exp=6, weights=None): + for i in range(len(colors)): + if isinstance(colors[i], Image.Image): + colors[i] = torch.tensor( + np.array( + colors[i]) / 255.0, + device=self.device).float() + if weights is None: + weights = [1.0 for _ in range(colors)] + textures = [] + cos_maps = [] + for color, elev, azim, weight in zip(colors, elevs, azims, weights): + texture, cos_map, _ = self.back_project( + color, elev, azim, camera_distance, center) + cos_map = weight * (cos_map ** exp) + textures.append(texture) + cos_maps.append(cos_map) + + texture_merge, trust_map_merge = self.fast_bake_texture( + textures, cos_maps) + return texture_merge, trust_map_merge + + @torch.no_grad() + def fast_bake_texture(self, textures, cos_maps): + + channel = textures[0].shape[-1] + texture_merge = torch.zeros( + self.texture_size + (channel,)).to(self.device) + trust_map_merge = 
torch.zeros(self.texture_size + (1,)).to(self.device) + for texture, cos_map in zip(textures, cos_maps): + view_sum = (cos_map > 0).sum() + painted_sum = ((cos_map > 0) * (trust_map_merge > 0)).sum() + if painted_sum / view_sum > 0.99: + continue + texture_merge += texture * cos_map + trust_map_merge += cos_map + texture_merge = texture_merge / torch.clamp(trust_map_merge, min=1E-8) + + return texture_merge, trust_map_merge > 1E-8 + + def uv_inpaint(self, texture, mask): + + if isinstance(texture, torch.Tensor): + texture_np = texture.cpu().numpy() + elif isinstance(texture, np.ndarray): + texture_np = texture + elif isinstance(texture, Image.Image): + texture_np = np.array(texture) / 255.0 + + vtx_pos, pos_idx, vtx_uv, uv_idx = self.get_mesh() + + texture_np, mask = meshVerticeInpaint( + texture_np, mask, vtx_pos, vtx_uv, pos_idx, uv_idx) + + texture_np = cv2.inpaint( + (texture_np * + 255).astype( + np.uint8), + 255 - + mask, + 3, + cv2.INPAINT_NS) + + return texture_np diff --git a/hy3dgen/texgen/differentiable_renderer/mesh_utils.py b/hy3dgen/texgen/differentiable_renderer/mesh_utils.py new file mode 100644 index 0000000..fa5694a --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/mesh_utils.py @@ -0,0 +1,34 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import trimesh + + +def load_mesh(mesh): + vtx_pos = mesh.vertices if hasattr(mesh, 'vertices') else None + pos_idx = mesh.faces if hasattr(mesh, 'faces') else None + + vtx_uv = mesh.visual.uv if hasattr(mesh.visual, 'uv') else None + uv_idx = mesh.faces if hasattr(mesh, 'faces') else None + + texture_data = None + + return vtx_pos, pos_idx, vtx_uv, uv_idx, texture_data + + +def save_mesh(mesh, texture_data): + material = trimesh.visual.texture.SimpleMaterial(image=texture_data, diffuse=(255, 255, 255)) + texture_visuals = trimesh.visual.TextureVisuals(uv=mesh.visual.uv, image=texture_data, material=material) + mesh.visual = texture_visuals + return mesh diff --git a/hy3dgen/texgen/differentiable_renderer/setup.py b/hy3dgen/texgen/differentiable_renderer/setup.py new file mode 100644 index 0000000..1bfdb10 --- /dev/null +++ b/hy3dgen/texgen/differentiable_renderer/setup.py @@ -0,0 +1,62 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. 
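# Illustrative texture-baking flow with MeshRender above. A sketch under assumptions: CUDA and the
# custom_rasterizer extension are available, 'textured_mesh.obj' is a placeholder path for a
# trimesh with UVs, and the view angles / images are made-up stand-ins for the repo's real inputs.
import numpy as np
import trimesh
from PIL import Image
from hy3dgen.texgen.differentiable_renderer.mesh_render import MeshRender

renderer = MeshRender(default_resolution=1024, texture_size=1024, camera_type='orth', device='cuda')
renderer.load_mesh(trimesh.load('textured_mesh.obj'))

elevs, azims = [0, 0, 0, 0], [0, 90, 180, 270]                    # four views around the object
views = [Image.open(f'view_{i}.png') for i in range(4)]           # hypothetical RGB views to project

texture, trust = renderer.bake_texture(views, elevs, azims, weights=[1.0] * len(views))
mask = (trust[..., 0].cpu().numpy() * 255).astype(np.uint8)       # trusted texels as a 0/255 mask
texture = renderer.uv_inpaint(texture, mask)                      # vertex inpaint + cv2.INPAINT_NS
renderer.set_texture(texture / 255.0)                             # uv_inpaint returns uint8
baked_mesh = renderer.save_mesh()                                 # trimesh carrying the baked texture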
+# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +from setuptools import setup, Extension +import pybind11 +import sys +import platform + +def get_platform_specific_args(): + system = platform.system().lower() + cpp_std = 'c++14' # Make configurable if needed + + if sys.platform == 'win32': + compile_args = ['/O2', f'/std:{cpp_std}', '/EHsc', '/MP', '/DWIN32_LEAN_AND_MEAN', '/bigobj'] + link_args = [] + extra_includes = [] + elif system == 'linux': + compile_args = ['-O3', f'-std={cpp_std}', '-fPIC', '-Wall', '-Wextra', '-pthread'] + link_args = ['-fPIC', '-pthread'] + extra_includes = [] + elif sys.platform == 'darwin': + compile_args = ['-O3', f'-std={cpp_std}', '-fPIC', '-Wall', '-Wextra', + '-stdlib=libc++', '-mmacosx-version-min=10.14'] + link_args = ['-fPIC', '-stdlib=libc++', '-mmacosx-version-min=10.14', '-dynamiclib'] + extra_includes = [] + else: + raise RuntimeError(f"Unsupported platform: {system}") + + return compile_args, link_args, extra_includes + +extra_compile_args, extra_link_args, platform_includes = get_platform_specific_args() +include_dirs = [pybind11.get_include(), pybind11.get_include(user=True)] +include_dirs.extend(platform_includes) + +ext_modules = [ + Extension( + "mesh_processor", + ["mesh_processor.cpp"], + include_dirs=include_dirs, + language='c++', + extra_compile_args=extra_compile_args, + extra_link_args=extra_link_args, + ), +] + +setup( + name="mesh_processor", + ext_modules=ext_modules, + install_requires=['pybind11>=2.6.0'], + python_requires='>=3.6', +) \ No newline at end of file diff --git a/hy3dgen/texgen/hunyuanpaint/__init__.py b/hy3dgen/texgen/hunyuanpaint/__init__.py new file mode 100644 index 0000000..8bb2bf8 --- /dev/null +++ b/hy3dgen/texgen/hunyuanpaint/__init__.py @@ -0,0 +1,13 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
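# Build sketch for the pybind11 extension declared in setup.py above (assumes pybind11 and a
# C++14 toolchain are installed; the in-place build command is the standard setuptools invocation,
# not something the patch itself prescribes):
#
#   cd hy3dgen/texgen/differentiable_renderer
#   python setup.py build_ext --inplace
#
# After building, the compiled module should import next to the pure-Python helpers:
import mesh_processor  # C++ module built from mesh_processor.cpp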
\ No newline at end of file diff --git a/hy3dgen/texgen/hunyuanpaint/pipeline.py b/hy3dgen/texgen/hunyuanpaint/pipeline.py new file mode 100644 index 0000000..38f3777 --- /dev/null +++ b/hy3dgen/texgen/hunyuanpaint/pipeline.py @@ -0,0 +1,722 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy +import numpy as np +import torch +import torch.distributed +import torch.utils.checkpoint +import transformers +from PIL import Image +import diffusers +from diffusers import ( + AutoencoderKL, + DDPMScheduler, + DiffusionPipeline, + EulerAncestralDiscreteScheduler, + UNet2DConditionModel, + ImagePipelineOutput +) +from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback +from diffusers.image_processor import PipelineImageInput +from diffusers.image_processor import VaeImageProcessor +from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput +from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipeline, \ + retrieve_timesteps, rescale_noise_cfg +from diffusers.schedulers import KarrasDiffusionSchedulers, LCMScheduler +from diffusers.utils import deprecate +from einops import rearrange +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from .unet.modules import UNet2p5DConditionModel, \ + compute_multi_resolution_mask, compute_multi_resolution_discrete_voxel_indice + +def guidance_scale_embedding(w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + +def append_dims(x, target_dims): + """Appends dimensions to the end of a tensor until it has target_dims dimensions.""" + dims_to_append = target_dims - x.ndim + if dims_to_append < 
0: + raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less") + return x[(...,) + (None,) * dims_to_append] + + +# From LCMScheduler.get_scalings_for_boundary_condition_discrete +def scalings_for_boundary_conditions(timestep, sigma_data=0.5, timestep_scaling=10.0): + scaled_timestep = timestep_scaling * timestep + c_skip = sigma_data ** 2 / (scaled_timestep ** 2 + sigma_data ** 2) + c_out = scaled_timestep / (scaled_timestep ** 2 + sigma_data ** 2) ** 0.5 + return c_skip, c_out + + +# Compare LCMScheduler.step, Step 4 +def get_predicted_original_sample(model_output, timesteps, sample, prediction_type, alphas, sigmas, N_gen): + alphas = extract_into_tensor(alphas, timesteps, sample.shape, N_gen) + sigmas = extract_into_tensor(sigmas, timesteps, sample.shape, N_gen) + model_output = rearrange(model_output, '(b n) c h w -> b n c h w', n=N_gen) + if prediction_type == "epsilon": + pred_x_0 = (sample - sigmas * model_output) / alphas + elif prediction_type == "sample": + pred_x_0 = model_output + elif prediction_type == "v_prediction": + pred_x_0 = alphas * sample - sigmas * model_output + else: + raise ValueError( + f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`" + f" are supported." + ) + + return pred_x_0 + + +# Based on step 4 in DDIMScheduler.step +def get_predicted_noise(model_output, timesteps, sample, prediction_type, alphas, sigmas, N_gen): + alphas = extract_into_tensor(alphas, timesteps, sample.shape, N_gen) + sigmas = extract_into_tensor(sigmas, timesteps, sample.shape, N_gen) + model_output = rearrange(model_output, '(b n) c h w -> b n c h w', n=N_gen) + if prediction_type == "epsilon": + pred_epsilon = model_output + elif prediction_type == "sample": + pred_epsilon = (sample - alphas * model_output) / sigmas + elif prediction_type == "v_prediction": + pred_epsilon = alphas * model_output + sigmas * sample + else: + raise ValueError( + f"Prediction type {prediction_type} is not supported; currently, `epsilon`, `sample`, and `v_prediction`" + f" are supported." 
+ ) + + return pred_epsilon + +def extract_into_tensor(a, t, x_shape, N_gen): + # b, *_ = t.shape + out = a.gather(-1, t) + out = out.repeat(N_gen) + out = rearrange(out, '(b n) -> b n', n=N_gen) + b, c, *_ = out.shape + return out.reshape(b, c, *((1,) * (len(x_shape) - 2))) + +class DDIMSolver: + def __init__(self, alpha_cumprods, timesteps=1000, ddim_timesteps=50): + # DDIM sampling parameters + step_ratio = timesteps // ddim_timesteps + self.ddim_timesteps = (np.arange(1, ddim_timesteps + 1) * step_ratio).round().astype(np.int64) - 1 + self.ddim_alpha_cumprods = alpha_cumprods[self.ddim_timesteps] + self.ddim_alpha_cumprods_prev = np.asarray( + [alpha_cumprods[0]] + alpha_cumprods[self.ddim_timesteps[:-1]].tolist() + ) + # convert to torch tensors + self.ddim_timesteps = torch.from_numpy(self.ddim_timesteps).long() + self.ddim_alpha_cumprods = torch.from_numpy(self.ddim_alpha_cumprods) + self.ddim_alpha_cumprods_prev = torch.from_numpy(self.ddim_alpha_cumprods_prev) + + def to(self, device): + self.ddim_timesteps = self.ddim_timesteps.to(device) + self.ddim_alpha_cumprods = self.ddim_alpha_cumprods.to(device) + self.ddim_alpha_cumprods_prev = self.ddim_alpha_cumprods_prev.to(device) + return self + + def ddim_step(self, pred_x0, pred_noise, timestep_index, N_gen): + alpha_cumprod_prev = extract_into_tensor(self.ddim_alpha_cumprods_prev, timestep_index, pred_x0.shape, N_gen) + dir_xt = (1.0 - alpha_cumprod_prev).sqrt() * pred_noise + x_prev = alpha_cumprod_prev.sqrt() * pred_x0 + dir_xt + return x_prev + + +@torch.no_grad() +def update_ema(target_params, source_params, rate=0.99): + """ + Update target parameters to be closer to those of source parameters using + an exponential moving average. + + :param target_params: the target parameter sequence. + :param source_params: the source parameter sequence. + :param rate: the EMA rate (closer to 1 means slower). 
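+    Example: with rate=0.99 each call leaves targ at 0.99 * targ + 0.01 * src.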
+ """ + + for targ, src in zip(target_params, source_params): + targ.detach().mul_(rate).add_(src, alpha=1 - rate) + +def to_rgb_image(maybe_rgba: Image.Image): + if maybe_rgba.mode == 'RGB': + return maybe_rgba + elif maybe_rgba.mode == 'RGBA': + rgba = maybe_rgba + img = numpy.random.randint(127, 128, size=[rgba.size[1], rgba.size[0], 3], dtype=numpy.uint8) + img = Image.fromarray(img, 'RGB') + img.paste(rgba, mask=rgba.getchannel('A')) + return img + else: + raise ValueError("Unsupported image type.", maybe_rgba.mode) + + +class HunyuanPaintPipeline(StableDiffusionPipeline): + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2p5DConditionModel, + scheduler: KarrasDiffusionSchedulers, + feature_extractor: CLIPImageProcessor, + safety_checker=None, + use_torch_compile=False, + ): + DiffusionPipeline.__init__(self) + + safety_checker = None + self.register_modules( + vae=torch.compile(vae) if use_torch_compile else vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=torch.compile(feature_extractor) if use_torch_compile else feature_extractor, + ) + self.solver = DDIMSolver( + scheduler.alphas_cumprod.numpy(), + timesteps=scheduler.config.num_train_timesteps, + ddim_timesteps=30, + ).to('cuda') + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.is_turbo = False + + def set_turbo(self, is_turbo: bool): + self.is_turbo = is_turbo + + @torch.no_grad() + def encode_images(self, images): + B = images.shape[0] + images = rearrange(images, 'b n c h w -> (b n) c h w') + + dtype = next(self.vae.parameters()).dtype + images = (images - 0.5) * 2.0 + posterior = self.vae.encode(images.to(dtype)).latent_dist + latents = posterior.sample() * self.vae.config.scaling_factor + + latents = rearrange(latents, '(b n) c h w -> b n c h w', b=B) + return latents + + @torch.no_grad() + def __call__( + self, + image: Image.Image = None, + prompt=None, + negative_prompt='watermark, ugly, deformed, noisy, blurry, low contrast', + *args, + num_images_per_prompt: Optional[int] = 1, + guidance_scale=2.0, + output_type: Optional[str] = "pil", + width=512, + height=512, + num_inference_steps=28, + return_dict=True, + **cached_condition, + ): + device = self._execution_device + + if image is None: + raise ValueError("Inputting embeddings not supported for this pipeline. Please pass an image.") + assert not isinstance(image, torch.Tensor) + + if not isinstance(image, List): + image = [image] + + image = [to_rgb_image(img) for img in image] + + image_vae = [torch.tensor(np.array(img) / 255.0) for img in image] + image_vae = [img_vae.unsqueeze(0).permute(0, 3, 1, 2).unsqueeze(0) for img_vae in image_vae] + image_vae = torch.cat(image_vae, dim=1) + image_vae = image_vae.to(device=device, dtype=self.vae.dtype) + + batch_size, N_ref = image_vae.shape[0], image_vae.shape[1] + assert batch_size == 1 + assert num_images_per_prompt == 1 + + ref_latents = self.encode_images(image_vae) + + def convert_pil_list_to_tensor(images): + bg_c = [1., 1., 1.] + images_tensor = [] + for batch_imgs in images: + view_imgs = [] + for pil_img in batch_imgs: + img = numpy.asarray(pil_img, dtype=numpy.float32) / 255. 
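+                    # RGBA inputs: composite RGB onto the white background bg_c via the alpha channel.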
+ if img.shape[2] > 3: + alpha = img[:, :, 3:] + img = img[:, :, :3] * alpha + bg_c * (1 - alpha) + img = torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0).contiguous().half().to("cuda") + view_imgs.append(img) + view_imgs = torch.cat(view_imgs, dim=0) + images_tensor.append(view_imgs.unsqueeze(0)) + + images_tensor = torch.cat(images_tensor, dim=0) + return images_tensor + + if "normal_imgs" in cached_condition: + + if isinstance(cached_condition["normal_imgs"], List): + cached_condition["normal_imgs"] = convert_pil_list_to_tensor(cached_condition["normal_imgs"]) + + cached_condition['normal_imgs'] = self.encode_images(cached_condition["normal_imgs"]) + + if "position_imgs" in cached_condition: + + if isinstance(cached_condition["position_imgs"], List): + cached_condition["position_imgs"] = convert_pil_list_to_tensor(cached_condition["position_imgs"]) + + cached_condition['position_maps'] = cached_condition['position_imgs'] + cached_condition["position_imgs"] = self.encode_images(cached_condition["position_imgs"]) + + if 'camera_info_gen' in cached_condition: + camera_info = cached_condition['camera_info_gen'] # B,N + if isinstance(camera_info, List): + camera_info = torch.tensor(camera_info) + camera_info = camera_info.to(device).to(torch.int64) + cached_condition['camera_info_gen'] = camera_info + if 'camera_info_ref' in cached_condition: + camera_info = cached_condition['camera_info_ref'] # B,N + if isinstance(camera_info, List): + camera_info = torch.tensor(camera_info) + camera_info = camera_info.to(device).to(torch.int64) + cached_condition['camera_info_ref'] = camera_info + + cached_condition['ref_latents'] = ref_latents + + if self.is_turbo: + if 'position_maps' in cached_condition: + cached_condition['position_attn_mask'] = ( + compute_multi_resolution_mask(cached_condition['position_maps']) + ) + cached_condition['position_voxel_indices'] = ( + compute_multi_resolution_discrete_voxel_indice(cached_condition['position_maps']) + ) + + if (guidance_scale > 1) and (not self.is_turbo): + negative_ref_latents = torch.zeros_like(cached_condition['ref_latents']) + cached_condition['ref_latents'] = torch.cat([negative_ref_latents, cached_condition['ref_latents']]) + cached_condition['ref_scale'] = torch.as_tensor([0.0, 1.0]).to(cached_condition['ref_latents']) + if "normal_imgs" in cached_condition: + cached_condition['normal_imgs'] = torch.cat( + (cached_condition['normal_imgs'], cached_condition['normal_imgs'])) + + if "position_imgs" in cached_condition: + cached_condition['position_imgs'] = torch.cat( + (cached_condition['position_imgs'], cached_condition['position_imgs'])) + + if 'position_maps' in cached_condition: + cached_condition['position_maps'] = torch.cat( + (cached_condition['position_maps'], cached_condition['position_maps'])) + + if 'camera_info_gen' in cached_condition: + cached_condition['camera_info_gen'] = torch.cat( + (cached_condition['camera_info_gen'], cached_condition['camera_info_gen'])) + if 'camera_info_ref' in cached_condition: + cached_condition['camera_info_ref'] = torch.cat( + (cached_condition['camera_info_ref'], cached_condition['camera_info_ref'])) + + prompt_embeds = self.unet.learned_text_clip_gen.repeat(num_images_per_prompt, 1, 1) + negative_prompt_embeds = torch.zeros_like(prompt_embeds) + + latents: torch.Tensor = self.denoise( + None, + *args, + cross_attention_kwargs=None, + guidance_scale=guidance_scale, + num_images_per_prompt=num_images_per_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + 
num_inference_steps=num_inference_steps, + output_type='latent', + width=width, + height=height, + **cached_condition + ).images + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + else: + image = latents + + image = self.image_processor.postprocess(image, output_type=output_type) + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) + + def denoise( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + sigmas: List[float] = None, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. 
+ eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.Tensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`] + (https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + guidance_rescale (`float`, *optional*, defaults to 0.0): + Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when + using zero terminal SNR. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*): + A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of + each denoising step during the inference. with the following arguments: `callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a + list of all tensors as specified by `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. 
You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated,", + "consider using `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated,", + "consider using `callback_on_step_end`", + ) + + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + # to deal with lora scaling and other possible forward hooks + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._guidance_rescale = guidance_rescale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._interrupt = False + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3. Encode input prompt + lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + self.do_classifier_free_guidance if self.is_turbo else False, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + clip_skip=self.clip_skip, + ) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if (self.do_classifier_free_guidance) and (not self.is_turbo): + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance if self.is_turbo else False, + ) + + # 4. 
Prepare + if self.is_turbo: + bsz = 3 + N_gen = 15 + index = torch.range(29, 0, -bsz, device='cuda').long() + timesteps = self.solver.ddim_timesteps[index] + self.scheduler.set_timesteps(timesteps=timesteps.cpu(), device='cuda') + else: + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, sigmas + ) + + assert num_images_per_prompt == 1 + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * kwargs['num_in_batch'], # num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 6.1 Add image embeds for IP-Adapter + added_cond_kwargs = ( + {"image_embeds": image_embeds} + if (ip_adapter_image is not None or ip_adapter_image_embeds is not None) + else None + ) + + # 6.2 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + # expand the latents if we are doing classifier free guidance + latents = rearrange(latents, '(b n) c h w -> b n c h w', n=kwargs['num_in_batch']) + latent_model_input = ( + torch.cat([latents] * 2) + if ((self.do_classifier_free_guidance) and (not self.is_turbo)) + else latents + ) + latent_model_input = rearrange(latent_model_input, 'b n c h w -> (b n) c h w') + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + latent_model_input = rearrange(latent_model_input, '(b n) c h w ->b n c h w', n=kwargs['num_in_batch']) + + # predict the noise residual + + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, **kwargs + )[0] + latents = rearrange(latents, 'b n c h w -> (b n) c h w') + # perform guidance + if (self.do_classifier_free_guidance) and (not self.is_turbo): + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + if (self.do_classifier_free_guidance) and (self.guidance_rescale > 0.0) and (not self.is_turbo): + # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + latents = \ + self.scheduler.step(noise_pred, t, latents[:, :num_channels_latents, :, :], **extra_step_kwargs, + return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[ + 0 + ] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/hy3dgen/texgen/hunyuanpaint/unet/__init__.py b/hy3dgen/texgen/hunyuanpaint/unet/__init__.py new file mode 100644 index 0000000..8bb2bf8 --- /dev/null +++ b/hy3dgen/texgen/hunyuanpaint/unet/__init__.py @@ -0,0 +1,13 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. \ No newline at end of file diff --git a/hy3dgen/texgen/hunyuanpaint/unet/modules.py b/hy3dgen/texgen/hunyuanpaint/unet/modules.py new file mode 100644 index 0000000..f558cd7 --- /dev/null +++ b/hy3dgen/texgen/hunyuanpaint/unet/modules.py @@ -0,0 +1,599 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. 
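# Illustrative call sketch for HunyuanPaintPipeline.__call__ above. Everything below is a
# hypothetical assumption for illustration: `pipe` stands for an already-loaded pipeline (the
# texgen wrapper normally constructs and drives it), the file names are placeholders, and the
# six-view setup and camera index encoding are examples rather than the repo's configuration.
from PIL import Image

ref_image = Image.open('reference.png')                               # single reference image
normal_views = [[Image.open(f'normal_{i}.png') for i in range(6)]]    # one batch of 6 normal maps
position_views = [[Image.open(f'position_{i}.png') for i in range(6)]]

out = pipe(
    image=ref_image,
    normal_imgs=normal_views,
    position_imgs=position_views,
    camera_info_gen=[[0, 1, 2, 3, 4, 5]],    # per-view camera indices, shape (B, N)
    camera_info_ref=[[0]],
    num_in_batch=6,                          # number of views denoised jointly
    num_inference_steps=28,
    guidance_scale=2.0,
)
generated_views = out.images                 # list of PIL images, one per view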
+# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import copy +import json +import os +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from diffusers.models import UNet2DConditionModel +from diffusers.models.attention_processor import Attention +from diffusers.models.transformers.transformer_2d import BasicTransformerBlock +from einops import rearrange + + +def _chunked_feed_forward(ff: nn.Module, hidden_states: torch.Tensor, chunk_dim: int, chunk_size: int): + # "feed_forward_chunk_size" can be used to save memory + if hidden_states.shape[chunk_dim] % chunk_size != 0: + raise ValueError( + f"`hidden_states` dimension to be chunked: {hidden_states.shape[chunk_dim]}" + f"has to be divisible by chunk size: {chunk_size}." + f" Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`." + ) + + num_chunks = hidden_states.shape[chunk_dim] // chunk_size + ff_output = torch.cat( + [ff(hid_slice) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)], + dim=chunk_dim, + ) + return ff_output + + +class Basic2p5DTransformerBlock(torch.nn.Module): + def __init__(self, transformer: BasicTransformerBlock,layer_name,use_ma=True,use_ra=True,is_turbo=False) -> None: + super().__init__() + self.transformer = transformer + self.layer_name = layer_name + self.use_ma = use_ma + self.use_ra = use_ra + self.is_turbo = is_turbo + + # multiview attn + if self.use_ma: + self.attn_multiview = Attention( + query_dim=self.dim, + heads=self.num_attention_heads, + dim_head=self.attention_head_dim, + dropout=self.dropout, + bias=self.attention_bias, + cross_attention_dim=None, + upcast_attention=self.attn1.upcast_attention, + out_bias=True, + ) + + # ref attn + if self.use_ra: + self.attn_refview = Attention( + query_dim=self.dim, + heads=self.num_attention_heads, + dim_head=self.attention_head_dim, + dropout=self.dropout, + bias=self.attention_bias, + cross_attention_dim=None, + upcast_attention=self.attn1.upcast_attention, + out_bias=True, + ) + if self.is_turbo: + self._initialize_attn_weights() + + def _initialize_attn_weights(self): + + if self.use_ma: + self.attn_multiview.load_state_dict(self.attn1.state_dict()) + with torch.no_grad(): + for layer in self.attn_multiview.to_out: + for param in layer.parameters(): + param.zero_() + if self.use_ra: + self.attn_refview.load_state_dict(self.attn1.state_dict()) + with torch.no_grad(): + for layer in self.attn_refview.to_out: + for param in layer.parameters(): + param.zero_() + + def __getattr__(self, name: str): + try: + return super().__getattr__(name) + except AttributeError: + return getattr(self.transformer, name) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + timestep: 
Optional[torch.LongTensor] = None, + cross_attention_kwargs: Dict[str, Any] = None, + class_labels: Optional[torch.LongTensor] = None, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, + ) -> torch.Tensor: + + # Notice that normalization is always applied before the real computation in the following blocks. + # 0. Self-Attention + batch_size = hidden_states.shape[0] + + cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {} + num_in_batch = cross_attention_kwargs.pop('num_in_batch', 1) + mode = cross_attention_kwargs.pop('mode', None) + if not self.is_turbo: + mva_scale = cross_attention_kwargs.pop('mva_scale', 1.0) + ref_scale = cross_attention_kwargs.pop('ref_scale', 1.0) + else: + position_attn_mask = cross_attention_kwargs.pop("position_attn_mask", None) + position_voxel_indices = cross_attention_kwargs.pop("position_voxel_indices", None) + mva_scale = 1.0 + ref_scale = 1.0 + + condition_embed_dict = cross_attention_kwargs.pop("condition_embed_dict", None) + + if self.norm_type == "ada_norm": + norm_hidden_states = self.norm1(hidden_states, timestep) + elif self.norm_type == "ada_norm_zero": + norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1( + hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype + ) + elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]: + norm_hidden_states = self.norm1(hidden_states) + elif self.norm_type == "ada_norm_continuous": + norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"]) + elif self.norm_type == "ada_norm_single": + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = ( + self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1) + ).chunk(6, dim=1) + norm_hidden_states = self.norm1(hidden_states) + norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa + else: + raise ValueError("Incorrect norm used") + + if self.pos_embed is not None: + norm_hidden_states = self.pos_embed(norm_hidden_states) + + # 1. 
Prepare GLIGEN inputs + cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {} + gligen_kwargs = cross_attention_kwargs.pop("gligen", None) + + attn_output = self.attn1( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + + if self.norm_type == "ada_norm_zero": + attn_output = gate_msa.unsqueeze(1) * attn_output + elif self.norm_type == "ada_norm_single": + attn_output = gate_msa * attn_output + + hidden_states = attn_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + # 1.2 Reference Attention + if 'w' in mode: + condition_embed_dict[self.layer_name] = rearrange( + norm_hidden_states, '(b n) l c -> b (n l) c', + n=num_in_batch + ) # B, (N L), C + + if 'r' in mode and self.use_ra: + condition_embed = condition_embed_dict[self.layer_name].unsqueeze(1).repeat(1, num_in_batch, 1, + 1) # B N L C + condition_embed = rearrange(condition_embed, 'b n l c -> (b n) l c') + + attn_output = self.attn_refview( + norm_hidden_states, + encoder_hidden_states=condition_embed, + attention_mask=None, + **cross_attention_kwargs + ) + if not self.is_turbo: + ref_scale_timing = ref_scale + if isinstance(ref_scale, torch.Tensor): + ref_scale_timing = ref_scale.unsqueeze(1).repeat(1, num_in_batch).view(-1) + for _ in range(attn_output.ndim - 1): + ref_scale_timing = ref_scale_timing.unsqueeze(-1) + + hidden_states = ref_scale_timing * attn_output + hidden_states + + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + # 1.3 Multiview Attention + if num_in_batch > 1 and self.use_ma: + multivew_hidden_states = rearrange(norm_hidden_states, '(b n) l c -> b (n l) c', n=num_in_batch) + + if self.is_turbo: + position_mask = None + if position_attn_mask is not None: + if multivew_hidden_states.shape[1] in position_attn_mask: + position_mask = position_attn_mask[multivew_hidden_states.shape[1]] + position_indices = None + if position_voxel_indices is not None: + if multivew_hidden_states.shape[1] in position_voxel_indices: + position_indices = position_voxel_indices[multivew_hidden_states.shape[1]] + attn_output = self.attn_multiview( + multivew_hidden_states, + encoder_hidden_states=multivew_hidden_states, + attention_mask=position_mask, + position_indices=position_indices, + **cross_attention_kwargs + ) + else: + attn_output = self.attn_multiview( + multivew_hidden_states, + encoder_hidden_states=multivew_hidden_states, + **cross_attention_kwargs + ) + + attn_output = rearrange(attn_output, 'b (n l) c -> (b n) l c', n=num_in_batch) + + hidden_states = mva_scale * attn_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + # 1.2 GLIGEN Control + if gligen_kwargs is not None: + hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"]) + + # 3. 
Cross-Attention + if self.attn2 is not None: + if self.norm_type == "ada_norm": + norm_hidden_states = self.norm2(hidden_states, timestep) + elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]: + norm_hidden_states = self.norm2(hidden_states) + elif self.norm_type == "ada_norm_single": + # For PixArt norm2 isn't applied here: + # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103 + norm_hidden_states = hidden_states + elif self.norm_type == "ada_norm_continuous": + norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"]) + else: + raise ValueError("Incorrect norm") + + if self.pos_embed is not None and self.norm_type != "ada_norm_single": + norm_hidden_states = self.pos_embed(norm_hidden_states) + + attn_output = self.attn2( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + **cross_attention_kwargs, + ) + + hidden_states = attn_output + hidden_states + + # 4. Feed-forward + # i2vgen doesn't have this norm 🤷‍♂️ + if self.norm_type == "ada_norm_continuous": + norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"]) + elif not self.norm_type == "ada_norm_single": + norm_hidden_states = self.norm3(hidden_states) + + if self.norm_type == "ada_norm_zero": + norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] + + if self.norm_type == "ada_norm_single": + norm_hidden_states = self.norm2(hidden_states) + norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp + + if self._chunk_size is not None: + # "feed_forward_chunk_size" can be used to save memory + ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size) + else: + ff_output = self.ff(norm_hidden_states) + + if self.norm_type == "ada_norm_zero": + ff_output = gate_mlp.unsqueeze(1) * ff_output + elif self.norm_type == "ada_norm_single": + ff_output = gate_mlp * ff_output + + hidden_states = ff_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + return hidden_states + +@torch.no_grad() +def compute_voxel_grid_mask(position, grid_resolution=8): + + position = position.half() + B,N,_,H,W = position.shape + assert H%grid_resolution==0 and W%grid_resolution==0 + + valid_mask = (position != 1).all(dim=2, keepdim=True) + valid_mask = valid_mask.expand_as(position) + position[valid_mask==False] = 0 + + + position = rearrange( + position, + 'b n c (num_h grid_h) (num_w grid_w) -> b n num_h num_w c grid_h grid_w', + num_h=grid_resolution, num_w=grid_resolution + ) + valid_mask = rearrange( + valid_mask, + 'b n c (num_h grid_h) (num_w grid_w) -> b n num_h num_w c grid_h grid_w', + num_h=grid_resolution, num_w=grid_resolution + ) + + grid_position = position.sum(dim=(-2, -1)) + count_masked = valid_mask.sum(dim=(-2, -1)) + + grid_position = grid_position / count_masked.clamp(min=1) + grid_position[count_masked<5] = 0 + + grid_position = grid_position.permute(0,1,4,2,3) + grid_position = rearrange(grid_position, 'b n c h w -> b n (h w) c') + + grid_position_expanded_1 = grid_position.unsqueeze(2).unsqueeze(4) # 形状变为 B, N, 1, L, 1, 3 + grid_position_expanded_2 = grid_position.unsqueeze(1).unsqueeze(3) # 形状变为 B, 1, N, 1, L, 3 + + # 计算欧氏距离 + distances = torch.norm(grid_position_expanded_1 - grid_position_expanded_2, dim=-1) # 形状为 B, N, N, L, L + + weights = distances + grid_distance = 
1.73/grid_resolution + + #weights = weights*-32 + #weights = weights.clamp(min=-10000.0) + + weights = weights< grid_distance + + return weights + +def compute_multi_resolution_mask(position_maps, grid_resolutions=[32, 16, 8]): + position_attn_mask = {} + with torch.no_grad(): + for grid_resolution in grid_resolutions: + position_mask = compute_voxel_grid_mask(position_maps, grid_resolution) + position_mask = rearrange(position_mask, 'b ni nj li lj -> b (ni li) (nj lj)') + position_attn_mask[position_mask.shape[1]] = position_mask + return position_attn_mask + +@torch.no_grad() +def compute_discrete_voxel_indice(position, grid_resolution=8, voxel_resolution=128): + + position = position.half() + B,N,_,H,W = position.shape + assert H%grid_resolution==0 and W%grid_resolution==0 + + valid_mask = (position != 1).all(dim=2, keepdim=True) + valid_mask = valid_mask.expand_as(position) + position[valid_mask==False] = 0 + + position = rearrange( + position, + 'b n c (num_h grid_h) (num_w grid_w) -> b n num_h num_w c grid_h grid_w', + num_h=grid_resolution, num_w=grid_resolution + ) + valid_mask = rearrange( + valid_mask, + 'b n c (num_h grid_h) (num_w grid_w) -> b n num_h num_w c grid_h grid_w', + num_h=grid_resolution, num_w=grid_resolution + ) + + grid_position = position.sum(dim=(-2, -1)) + count_masked = valid_mask.sum(dim=(-2, -1)) + + grid_position = grid_position / count_masked.clamp(min=1) + grid_position[count_masked<5] = 0 + + grid_position = grid_position.permute(0,1,4,2,3).clamp(0, 1) # B N C H W + voxel_indices = grid_position * (voxel_resolution - 1) + voxel_indices = torch.round(voxel_indices).long() + return voxel_indices + +def compute_multi_resolution_discrete_voxel_indice( + position_maps, + grid_resolutions=[64, 32, 16, 8], + voxel_resolutions=[512, 256, 128, 64] +): + voxel_indices = {} + with torch.no_grad(): + for grid_resolution, voxel_resolution in zip(grid_resolutions, voxel_resolutions): + voxel_indice = compute_discrete_voxel_indice(position_maps, grid_resolution, voxel_resolution) + voxel_indice = rearrange(voxel_indice, 'b n c h w -> b (n h w) c') + voxel_indices[voxel_indice.shape[1]] = {'voxel_indices':voxel_indice, 'voxel_resolution':voxel_resolution} + return voxel_indices + +class UNet2p5DConditionModel(torch.nn.Module): + def __init__(self, unet: UNet2DConditionModel) -> None: + super().__init__() + self.unet = unet + + self.use_ma = True + self.use_ra = True + self.use_camera_embedding = True + self.use_dual_stream = True + self.is_turbo = False + + if self.use_dual_stream: + self.unet_dual = copy.deepcopy(unet) + self.init_attention(self.unet_dual) + self.init_attention(self.unet, use_ma=self.use_ma, use_ra=self.use_ra, is_turbo=self.is_turbo) + self.init_condition() + self.init_camera_embedding() + + @staticmethod + def from_pretrained(pretrained_model_name_or_path, **kwargs): + torch_dtype = kwargs.pop('torch_dtype', torch.float32) + config_path = os.path.join(pretrained_model_name_or_path, 'config.json') + unet_ckpt_path = os.path.join(pretrained_model_name_or_path, 'diffusion_pytorch_model.bin') + with open(config_path, 'r', encoding='utf-8') as file: + config = json.load(file) + unet = UNet2DConditionModel(**config) + unet = UNet2p5DConditionModel(unet) + unet_ckpt = torch.load(unet_ckpt_path, map_location='cpu', weights_only=True) + unet.load_state_dict(unet_ckpt, strict=True) + unet = unet.to(torch_dtype) + return unet + + def init_condition(self): + self.unet.conv_in = torch.nn.Conv2d( + 12, + self.unet.conv_in.out_channels, + 
kernel_size=self.unet.conv_in.kernel_size, + stride=self.unet.conv_in.stride, + padding=self.unet.conv_in.padding, + dilation=self.unet.conv_in.dilation, + groups=self.unet.conv_in.groups, + bias=self.unet.conv_in.bias is not None) + + self.unet.learned_text_clip_gen = nn.Parameter(torch.randn(1, 77, 1024)) + self.unet.learned_text_clip_ref = nn.Parameter(torch.randn(1, 77, 1024)) + + def init_camera_embedding(self): + + if self.use_camera_embedding: + time_embed_dim = 1280 + self.max_num_ref_image = 5 + self.max_num_gen_image = 12 * 3 + 4 * 2 + self.unet.class_embedding = nn.Embedding(self.max_num_ref_image + self.max_num_gen_image, time_embed_dim) + + def init_attention(self, unet, use_ma=False, use_ra=False, is_turbo=False): + + for down_block_i, down_block in enumerate(unet.down_blocks): + if hasattr(down_block, "has_cross_attention") and down_block.has_cross_attention: + for attn_i, attn in enumerate(down_block.attentions): + for transformer_i, transformer in enumerate(attn.transformer_blocks): + if isinstance(transformer, BasicTransformerBlock): + attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock( + transformer, + f'down_{down_block_i}_{attn_i}_{transformer_i}', + use_ma, use_ra, is_turbo + ) + + if hasattr(unet.mid_block, "has_cross_attention") and unet.mid_block.has_cross_attention: + for attn_i, attn in enumerate(unet.mid_block.attentions): + for transformer_i, transformer in enumerate(attn.transformer_blocks): + if isinstance(transformer, BasicTransformerBlock): + attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock( + transformer, + f'mid_{attn_i}_{transformer_i}', + use_ma, use_ra, is_turbo + ) + + for up_block_i, up_block in enumerate(unet.up_blocks): + if hasattr(up_block, "has_cross_attention") and up_block.has_cross_attention: + for attn_i, attn in enumerate(up_block.attentions): + for transformer_i, transformer in enumerate(attn.transformer_blocks): + if isinstance(transformer, BasicTransformerBlock): + attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock( + transformer, + f'up_{up_block_i}_{attn_i}_{transformer_i}', + use_ma, use_ra, is_turbo + ) + + def __getattr__(self, name: str): + try: + return super().__getattr__(name) + except AttributeError: + return getattr(self.unet, name) + + def forward( + self, sample, timestep, encoder_hidden_states, + *args, down_intrablock_additional_residuals=None, + down_block_res_samples=None, mid_block_res_sample=None, + **cached_condition, + ): + B, N_gen, _, H, W = sample.shape + assert H == W + + if self.use_camera_embedding: + camera_info_gen = cached_condition['camera_info_gen'] + self.max_num_ref_image + camera_info_gen = rearrange(camera_info_gen, 'b n -> (b n)') + else: + camera_info_gen = None + + sample = [sample] + if 'normal_imgs' in cached_condition: + sample.append(cached_condition["normal_imgs"]) + if 'position_imgs' in cached_condition: + sample.append(cached_condition["position_imgs"]) + sample = torch.cat(sample, dim=2) + + sample = rearrange(sample, 'b n c h w -> (b n) c h w') + + encoder_hidden_states_gen = encoder_hidden_states.unsqueeze(1).repeat(1, N_gen, 1, 1) + encoder_hidden_states_gen = rearrange(encoder_hidden_states_gen, 'b n l c -> (b n) l c') + + if self.use_ra: + if 'condition_embed_dict' in cached_condition: + condition_embed_dict = cached_condition['condition_embed_dict'] + else: + condition_embed_dict = {} + ref_latents = cached_condition['ref_latents'] + N_ref = ref_latents.shape[1] + if self.use_camera_embedding: + camera_info_ref = 
cached_condition['camera_info_ref'] + camera_info_ref = rearrange(camera_info_ref, 'b n -> (b n)') + else: + camera_info_ref = None + + ref_latents = rearrange(ref_latents, 'b n c h w -> (b n) c h w') + + encoder_hidden_states_ref = self.unet.learned_text_clip_ref.unsqueeze(1).repeat(B, N_ref, 1, 1) + encoder_hidden_states_ref = rearrange(encoder_hidden_states_ref, 'b n l c -> (b n) l c') + + noisy_ref_latents = ref_latents + timestep_ref = 0 + + if self.use_dual_stream: + unet_ref = self.unet_dual + else: + unet_ref = self.unet + unet_ref( + noisy_ref_latents, timestep_ref, + encoder_hidden_states=encoder_hidden_states_ref, + class_labels=camera_info_ref, + # **kwargs + return_dict=False, + cross_attention_kwargs={ + 'mode': 'w', 'num_in_batch': N_ref, + 'condition_embed_dict': condition_embed_dict}, + ) + cached_condition['condition_embed_dict'] = condition_embed_dict + else: + condition_embed_dict = None + + mva_scale = cached_condition.get('mva_scale', 1.0) + ref_scale = cached_condition.get('ref_scale', 1.0) + + if self.is_turbo: + cross_attention_kwargs_ = { + 'mode': 'r', 'num_in_batch': N_gen, + 'condition_embed_dict': condition_embed_dict, + 'position_attn_mask':position_attn_mask, + 'position_voxel_indices':position_voxel_indices, + 'mva_scale': mva_scale, + 'ref_scale': ref_scale, + } + else: + cross_attention_kwargs_ = { + 'mode': 'r', 'num_in_batch': N_gen, + 'condition_embed_dict': condition_embed_dict, + 'mva_scale': mva_scale, + 'ref_scale': ref_scale, + } + return self.unet( + sample, timestep, + encoder_hidden_states_gen, *args, + class_labels=camera_info_gen, + down_intrablock_additional_residuals=[ + sample.to(dtype=self.unet.dtype) for sample in down_intrablock_additional_residuals + ] if down_intrablock_additional_residuals is not None else None, + down_block_additional_residuals=[ + sample.to(dtype=self.unet.dtype) for sample in down_block_res_samples + ] if down_block_res_samples is not None else None, + mid_block_additional_residual=( + mid_block_res_sample.to(dtype=self.unet.dtype) + if mid_block_res_sample is not None else None + ), + return_dict=False, + cross_attention_kwargs=cross_attention_kwargs_, + ) diff --git a/hy3dgen/texgen/pipelines.py b/hy3dgen/texgen/pipelines.py new file mode 100644 index 0000000..508a971 --- /dev/null +++ b/hy3dgen/texgen/pipelines.py @@ -0,0 +1,239 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
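+
+# Rough flow of Hunyuan3DPaintPipeline.__call__ defined below (a reading aid, not an
+# exhaustive description): the reference image is delighted (Light_Shadow_Remover),
+# the mesh is UV-unwrapped (mesh_uv_wrap), normal and position maps are rendered for
+# the six candidate cameras, the multiview diffusion model generates the RGB views,
+# and the views are baked into a UV texture which is then inpainted where no view
+# covers it.
+#
+# A minimal usage sketch, assuming the weights live under 'tencent/Hunyuan3D-2':
+#
+#   pipe = Hunyuan3DPaintPipeline.from_pretrained('tencent/Hunyuan3D-2')
+#   textured_mesh = pipe(mesh, image='assets/demo.png')
+#   textured_mesh.export('textured.glb')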
+ + +import logging +import numpy as np +import os +import torch +from PIL import Image +from typing import List, Union, Optional + + +from .differentiable_renderer.mesh_render import MeshRender +from .utils.dehighlight_utils import Light_Shadow_Remover +from .utils.multiview_utils import Multiview_Diffusion_Net +from .utils.imagesuper_utils import Image_Super_Net +from .utils.uv_warp_utils import mesh_uv_wrap + +logger = logging.getLogger(__name__) + + +class Hunyuan3DTexGenConfig: + + def __init__(self, light_remover_ckpt_path, multiview_ckpt_path, subfolder_name): + self.device = 'cuda' + self.light_remover_ckpt_path = light_remover_ckpt_path + self.multiview_ckpt_path = multiview_ckpt_path + + self.candidate_camera_azims = [0, 90, 180, 270, 0, 180] + self.candidate_camera_elevs = [0, 0, 0, 0, 90, -90] + self.candidate_view_weights = [1, 0.1, 0.5, 0.1, 0.05, 0.05] + + self.render_size = 2048 + self.texture_size = 2048 + self.bake_exp = 4 + self.merge_method = 'fast' + + self.pipe_dict = {'hunyuan3d-paint-v2-0': 'hunyuanpaint', 'hunyuan3d-paint-v2-0-turbo': 'hunyuanpaint-turbo'} + self.pipe_name = self.pipe_dict[subfolder_name] + + +class Hunyuan3DPaintPipeline: + @classmethod + def from_pretrained(cls, model_path, subfolder='hunyuan3d-paint-v2-0-turbo'): + original_model_path = model_path + if not os.path.exists(model_path): + # try local path + base_dir = os.environ.get('HY3DGEN_MODELS', '~/.cache/hy3dgen') + model_path = os.path.expanduser(os.path.join(base_dir, model_path)) + + delight_model_path = os.path.join(model_path, 'hunyuan3d-delight-v2-0') + multiview_model_path = os.path.join(model_path, subfolder) + + if not os.path.exists(delight_model_path) or not os.path.exists(multiview_model_path): + try: + import huggingface_hub + # download from huggingface + model_path = huggingface_hub.snapshot_download( + repo_id=original_model_path, allow_patterns=["hunyuan3d-delight-v2-0/*"] + ) + model_path = huggingface_hub.snapshot_download( + repo_id=original_model_path, allow_patterns=[f'{subfolder}/*'] + ) + delight_model_path = os.path.join(model_path, 'hunyuan3d-delight-v2-0') + multiview_model_path = os.path.join(model_path, subfolder) + return cls(Hunyuan3DTexGenConfig(delight_model_path, multiview_model_path, subfolder)) + except ImportError: + logger.warning( + "You need to install HuggingFace Hub to load models from the hub." 
+ ) + raise RuntimeError(f"Model path {model_path} not found") + else: + return cls(Hunyuan3DTexGenConfig(delight_model_path, multiview_model_path, subfolder)) + + raise FileNotFoundError(f"Model path {original_model_path} not found and we could not find it at huggingface") + + def __init__(self, config): + self.config = config + self.models = {} + self.render = MeshRender( + default_resolution=self.config.render_size, + texture_size=self.config.texture_size) + + self.load_models() + + def load_models(self): + # empty cude cache + torch.cuda.empty_cache() + # Load model + self.models['delight_model'] = Light_Shadow_Remover(self.config) + self.models['multiview_model'] = Multiview_Diffusion_Net(self.config) + # self.models['super_model'] = Image_Super_Net(self.config) + + def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"): + self.models['delight_model'].pipeline.enable_model_cpu_offload(gpu_id=gpu_id, device=device) + self.models['multiview_model'].pipeline.enable_model_cpu_offload(gpu_id=gpu_id, device=device) + + def render_normal_multiview(self, camera_elevs, camera_azims, use_abs_coor=True): + normal_maps = [] + for elev, azim in zip(camera_elevs, camera_azims): + normal_map = self.render.render_normal( + elev, azim, use_abs_coor=use_abs_coor, return_type='pl') + normal_maps.append(normal_map) + + return normal_maps + + def render_position_multiview(self, camera_elevs, camera_azims): + position_maps = [] + for elev, azim in zip(camera_elevs, camera_azims): + position_map = self.render.render_position( + elev, azim, return_type='pl') + position_maps.append(position_map) + + return position_maps + + def bake_from_multiview(self, views, camera_elevs, + camera_azims, view_weights, method='graphcut'): + project_textures, project_weighted_cos_maps = [], [] + project_boundary_maps = [] + for view, camera_elev, camera_azim, weight in zip( + views, camera_elevs, camera_azims, view_weights): + project_texture, project_cos_map, project_boundary_map = self.render.back_project( + view, camera_elev, camera_azim) + project_cos_map = weight * (project_cos_map ** self.config.bake_exp) + project_textures.append(project_texture) + project_weighted_cos_maps.append(project_cos_map) + project_boundary_maps.append(project_boundary_map) + + if method == 'fast': + texture, ori_trust_map = self.render.fast_bake_texture( + project_textures, project_weighted_cos_maps) + else: + raise f'no method {method}' + return texture, ori_trust_map > 1E-8 + + def texture_inpaint(self, texture, mask): + + texture_np = self.render.uv_inpaint(texture, mask) + texture = torch.tensor(texture_np / 255).float().to(texture.device) + + return texture + + def recenter_image(self, image, border_ratio=0.2): + if image.mode == 'RGB': + return image + elif image.mode == 'L': + image = image.convert('RGB') + return image + + alpha_channel = np.array(image)[:, :, 3] + non_zero_indices = np.argwhere(alpha_channel > 0) + if non_zero_indices.size == 0: + raise ValueError("Image is fully transparent") + + min_row, min_col = non_zero_indices.min(axis=0) + max_row, max_col = non_zero_indices.max(axis=0) + + cropped_image = image.crop((min_col, min_row, max_col + 1, max_row + 1)) + + width, height = cropped_image.size + border_width = int(width * border_ratio) + border_height = int(height * border_ratio) + + new_width = width + 2 * border_width + new_height = height + 2 * border_height + + square_size = max(new_width, new_height) + + new_image = Image.new('RGBA', (square_size, square_size), 
(255, 255, 255, 0)) + + paste_x = (square_size - new_width) // 2 + border_width + paste_y = (square_size - new_height) // 2 + border_height + + new_image.paste(cropped_image, (paste_x, paste_y)) + return new_image + + @torch.no_grad() + def __call__(self, mesh, image): + + if not isinstance(image, List): + image = [image] + + images_prompt = [] + for i in range(len(image)): + if isinstance(image[i], str): + image_prompt = Image.open(image[i]) + else: + image_prompt = image[i] + images_prompt.append(image_prompt) + + images_prompt = [self.recenter_image(image_prompt) for image_prompt in images_prompt] + + images_prompt = [self.models['delight_model'](image_prompt) for image_prompt in images_prompt] + + mesh = mesh_uv_wrap(mesh) + + self.render.load_mesh(mesh) + + selected_camera_elevs, selected_camera_azims, selected_view_weights = \ + self.config.candidate_camera_elevs, self.config.candidate_camera_azims, self.config.candidate_view_weights + + normal_maps = self.render_normal_multiview( + selected_camera_elevs, selected_camera_azims, use_abs_coor=True) + position_maps = self.render_position_multiview( + selected_camera_elevs, selected_camera_azims) + + camera_info = [(((azim // 30) + 9) % 12) // {-20: 1, 0: 1, 20: 1, -90: 3, 90: 3}[ + elev] + {-20: 0, 0: 12, 20: 24, -90: 36, 90: 40}[elev] for azim, elev in + zip(selected_camera_azims, selected_camera_elevs)] + multiviews = self.models['multiview_model'](images_prompt, normal_maps + position_maps, camera_info) + + for i in range(len(multiviews)): + # multiviews[i] = self.models['super_model'](multiviews[i]) + multiviews[i] = multiviews[i].resize( + (self.config.render_size, self.config.render_size)) + + texture, mask = self.bake_from_multiview(multiviews, + selected_camera_elevs, selected_camera_azims, selected_view_weights, + method=self.config.merge_method) + + mask_np = (mask.squeeze(-1).cpu().numpy() * 255).astype(np.uint8) + + texture = self.texture_inpaint(texture, mask_np) + + self.render.set_texture(texture) + textured_mesh = self.render.save_mesh() + + return textured_mesh diff --git a/hy3dgen/texgen/utils/__init__.py b/hy3dgen/texgen/utils/__init__.py new file mode 100644 index 0000000..8bb2bf8 --- /dev/null +++ b/hy3dgen/texgen/utils/__init__.py @@ -0,0 +1,13 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
\ No newline at end of file diff --git a/hy3dgen/texgen/utils/alignImg4Tex_utils.py b/hy3dgen/texgen/utils/alignImg4Tex_utils.py new file mode 100644 index 0000000..34df204 --- /dev/null +++ b/hy3dgen/texgen/utils/alignImg4Tex_utils.py @@ -0,0 +1,121 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import torch +from diffusers import EulerAncestralDiscreteScheduler +from diffusers import StableDiffusionControlNetPipeline, StableDiffusionXLControlNetImg2ImgPipeline, ControlNetModel, \ + AutoencoderKL + + +class Img2img_Control_Ip_adapter: + def __init__(self, device): + controlnet = ControlNetModel.from_pretrained('lllyasviel/control_v11f1p_sd15_depth', torch_dtype=torch.float16, + variant="fp16", use_safetensors=True) + pipe = StableDiffusionControlNetPipeline.from_pretrained( + 'runwayml/stable-diffusion-v1-5', controlnet=controlnet, torch_dtype=torch.float16, use_safetensors=True + ) + pipe.load_ip_adapter('h94/IP-Adapter', subfolder="models", weight_name="ip-adapter-plus_sd15.safetensors") + pipe.set_ip_adapter_scale(0.7) + + pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) + # pipe.enable_model_cpu_offload() + self.pipe = pipe.to(device) + + def __call__( + self, + prompt, + control_image, + ip_adapter_image, + negative_prompt, + height=512, + width=512, + num_inference_steps=20, + guidance_scale=8.0, + controlnet_conditioning_scale=1.0, + output_type="pil", + **kwargs, + ): + results = self.pipe( + prompt=prompt, + negative_prompt=negative_prompt, + image=control_image, + ip_adapter_image=ip_adapter_image, + generator=torch.manual_seed(42), + seed=42, + num_inference_steps=num_inference_steps, + guidance_scale=guidance_scale, + controlnet_conditioning_scale=controlnet_conditioning_scale, + strength=1, + # clip_skip=2, + height=height, + width=width, + output_type=output_type, + **kwargs, + ).images[0] + return results + + +################################################################ + +class HesModel: + def __init__(self, ): + controlnet_depth = ControlNetModel.from_pretrained( + 'diffusers/controlnet-depth-sdxl-1.0', + torch_dtype=torch.float16, + variant="fp16", + use_safetensors=True + ) + self.pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained( + 'stabilityai/stable-diffusion-xl-base-1.0', + torch_dtype=torch.float16, + variant="fp16", + controlnet=controlnet_depth, + use_safetensors=True, + ) + self.pipe.vae = AutoencoderKL.from_pretrained( + 'madebyollin/sdxl-vae-fp16-fix', + torch_dtype=torch.float16 + ) + + self.pipe.load_ip_adapter('h94/IP-Adapter', subfolder="sdxl_models", weight_name="ip-adapter_sdxl.safetensors") + 
self.pipe.set_ip_adapter_scale(0.7) + self.pipe.to("cuda") + + def __call__(self, + init_image, + control_image, + ip_adapter_image=None, + prompt='3D image', + negative_prompt='2D image', + seed=42, + strength=0.8, + num_inference_steps=40, + guidance_scale=7.5, + controlnet_conditioning_scale=0.5, + **kwargs + ): + image = self.pipe( + prompt=prompt, + image=init_image, + control_image=control_image, + ip_adapter_image=ip_adapter_image, + negative_prompt=negative_prompt, + num_inference_steps=num_inference_steps, + guidance_scale=guidance_scale, + strength=strength, + controlnet_conditioning_scale=controlnet_conditioning_scale, + seed=seed, + **kwargs + ).images[0] + return image diff --git a/hy3dgen/texgen/utils/counter_utils.py b/hy3dgen/texgen/utils/counter_utils.py new file mode 100644 index 0000000..383a515 --- /dev/null +++ b/hy3dgen/texgen/utils/counter_utils.py @@ -0,0 +1,48 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + + +class RunningStats(): + def __init__(self) -> None: + self.count = 0 + self.sum = 0 + self.mean = 0 + self.min = None + self.max = None + + def add_value(self, value): + self.count += 1 + self.sum += value + self.mean = self.sum / self.count + + if self.min is None or value < self.min: + self.min = value + + if self.max is None or value > self.max: + self.max = value + + def get_count(self): + return self.count + + def get_sum(self): + return self.sum + + def get_mean(self): + return self.mean + + def get_min(self): + return self.min + + def get_max(self): + return self.max diff --git a/hy3dgen/texgen/utils/dehighlight_utils.py b/hy3dgen/texgen/utils/dehighlight_utils.py new file mode 100644 index 0000000..9b52368 --- /dev/null +++ b/hy3dgen/texgen/utils/dehighlight_utils.py @@ -0,0 +1,110 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. 
+ +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import cv2 +import numpy as np +import torch +from PIL import Image +from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler + + +class Light_Shadow_Remover(): + def __init__(self, config): + self.device = config.device + self.cfg_image = 1.5 + self.cfg_text = 1.0 + + pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained( + config.light_remover_ckpt_path, + torch_dtype=torch.float16, + safety_checker=None, + ) + pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config) + pipeline.set_progress_bar_config(disable=True) + + self.pipeline = pipeline.to(self.device, torch.float16) + + def recorrect_rgb(self, src_image, target_image, alpha_channel, scale=0.95): + + def flat_and_mask(bgr, a): + mask = torch.where(a > 0.5, True, False) + bgr_flat = bgr.reshape(-1, bgr.shape[-1]) + mask_flat = mask.reshape(-1) + bgr_flat_masked = bgr_flat[mask_flat, :] + return bgr_flat_masked + + src_flat = flat_and_mask(src_image, alpha_channel) + target_flat = flat_and_mask(target_image, alpha_channel) + corrected_bgr = torch.zeros_like(src_image) + + for i in range(3): + src_mean, src_stddev = torch.mean(src_flat[:, i]), torch.std(src_flat[:, i]) + target_mean, target_stddev = torch.mean(target_flat[:, i]), torch.std(target_flat[:, i]) + corrected_bgr[:, :, i] = torch.clamp( + (src_image[:, :, i] - scale * src_mean) * + (target_stddev / src_stddev) + scale * target_mean, + 0, 1) + + src_mse = torch.mean((src_image - target_image) ** 2) + modify_mse = torch.mean((corrected_bgr - target_image) ** 2) + if src_mse < modify_mse: + corrected_bgr = torch.cat([src_image, alpha_channel], dim=-1) + else: + corrected_bgr = torch.cat([corrected_bgr, alpha_channel], dim=-1) + + return corrected_bgr + + @torch.no_grad() + def __call__(self, image): + + image = image.resize((512, 512)) + + if image.mode == 'RGBA': + image_array = np.array(image) + alpha_channel = image_array[:, :, 3] + erosion_size = 3 + kernel = np.ones((erosion_size, erosion_size), np.uint8) + alpha_channel = cv2.erode(alpha_channel, kernel, iterations=1) + image_array[alpha_channel == 0, :3] = 255 + image_array[:, :, 3] = alpha_channel + image = Image.fromarray(image_array) + + image_tensor = torch.tensor(np.array(image) / 255.0).to(self.device) + alpha = image_tensor[:, :, 3:] + rgb_target = image_tensor[:, :, :3] + else: + image_tensor = torch.tensor(np.array(image) / 255.0).to(self.device) + alpha = torch.ones_like(image_tensor)[:, :, :1] + rgb_target = image_tensor[:, :, :3] + + image = image.convert('RGB') + + image = self.pipeline( + prompt="", + image=image, + generator=torch.manual_seed(42), + height=512, + width=512, + num_inference_steps=50, + image_guidance_scale=self.cfg_image, + guidance_scale=self.cfg_text, + ).images[0] + + image_tensor = torch.tensor(np.array(image)/255.0).to(self.device) + rgb_src = image_tensor[:,:,:3] + image = self.recorrect_rgb(rgb_src, rgb_target, alpha) + image = image[:,:,:3]*image[:,:,3:] + torch.ones_like(image[:,:,:3])*(1.0-image[:,:,3:]) + image = 
Image.fromarray((image.cpu().numpy()*255).astype(np.uint8)) + + return image diff --git a/hy3dgen/texgen/utils/imagesuper_utils.py b/hy3dgen/texgen/utils/imagesuper_utils.py new file mode 100644 index 0000000..0b893c5 --- /dev/null +++ b/hy3dgen/texgen/utils/imagesuper_utils.py @@ -0,0 +1,34 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import torch +from diffusers import StableDiffusionUpscalePipeline + +class Image_Super_Net(): + def __init__(self, config): + self.up_pipeline_x4 = StableDiffusionUpscalePipeline.from_pretrained( + 'stabilityai/stable-diffusion-x4-upscaler', + torch_dtype=torch.float16, + ).to(config.device) + self.up_pipeline_x4.set_progress_bar_config(disable=True) + + def __call__(self, image, prompt=''): + with torch.no_grad(): + upscaled_image = self.up_pipeline_x4( + prompt=[prompt], + image=image, + num_inference_steps=5, + ).images[0] + + return upscaled_image diff --git a/hy3dgen/texgen/utils/multiview_utils.py b/hy3dgen/texgen/utils/multiview_utils.py new file mode 100644 index 0000000..4d6a6ba --- /dev/null +++ b/hy3dgen/texgen/utils/multiview_utils.py @@ -0,0 +1,87 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
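+
+# Note on the call convention used by Hunyuan3DPaintPipeline (texgen/pipelines.py):
+# the wrapper below receives the reference image(s), then a single list of control
+# images whose first half are normal maps and whose second half are position maps
+# (it is invoked roughly as
+#   multiviews = multiview_model(images_prompt, normal_maps + position_maps, camera_info)
+# ), plus one camera index per generated view. All images are resized to the 512x512
+# view size before being handed to the custom 'hunyuanpaint' pipeline.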
+ +import os +import random + +import numpy as np +import torch +from typing import List +from diffusers import DiffusionPipeline +from diffusers import EulerAncestralDiscreteScheduler, LCMScheduler + + +class Multiview_Diffusion_Net(): + def __init__(self, config) -> None: + self.device = config.device + self.view_size = 512 + multiview_ckpt_path = config.multiview_ckpt_path + + current_file_path = os.path.abspath(__file__) + custom_pipeline_path = os.path.join(os.path.dirname(current_file_path), '..', 'hunyuanpaint') + + pipeline = DiffusionPipeline.from_pretrained( + multiview_ckpt_path, + custom_pipeline=custom_pipeline_path, torch_dtype=torch.float16) + + if config.pipe_name in ['hunyuanpaint']: + pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config, + timestep_spacing='trailing') + elif config.pipe_name in ['hunyuanpaint-turbo']: + pipeline.scheduler = LCMScheduler.from_config(pipeline.scheduler.config, + timestep_spacing='trailing') + pipeline.set_turbo(True) + # pipeline.prepare() + + pipeline.set_progress_bar_config(disable=True) + self.pipeline = pipeline.to(self.device) + + def seed_everything(self, seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + os.environ["PL_GLOBAL_SEED"] = str(seed) + + def __call__(self, input_images, control_images, camera_info): + + self.seed_everything(0) + + if not isinstance(input_images, List): + input_images = [input_images] + + input_images = [input_image.resize((self.view_size, self.view_size)) for input_image in input_images] + for i in range(len(control_images)): + control_images[i] = control_images[i].resize((self.view_size, self.view_size)) + if control_images[i].mode == 'L': + control_images[i] = control_images[i].point(lambda x: 255 if x > 1 else 0, mode='1') + + kwargs = dict(generator=torch.Generator(device=self.pipeline.device).manual_seed(0)) + + num_view = len(control_images) // 2 + normal_image = [[control_images[i] for i in range(num_view)]] + position_image = [[control_images[i + num_view] for i in range(num_view)]] + + camera_info_gen = [camera_info] + camera_info_ref = [[0]] + kwargs['width'] = self.view_size + kwargs['height'] = self.view_size + kwargs['num_in_batch'] = num_view + kwargs['camera_info_gen'] = camera_info_gen + kwargs['camera_info_ref'] = camera_info_ref + kwargs["normal_imgs"] = normal_image + kwargs["position_imgs"] = position_image + + mvd_image = self.pipeline(input_images, num_inference_steps=30, **kwargs).images + + return mvd_image diff --git a/hy3dgen/texgen/utils/simplify_mesh_utils.py b/hy3dgen/texgen/utils/simplify_mesh_utils.py new file mode 100644 index 0000000..5c23999 --- /dev/null +++ b/hy3dgen/texgen/utils/simplify_mesh_utils.py @@ -0,0 +1,36 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. 
+
+# For avoidance of doubts, Hunyuan 3D means the large language models and
+# their software and algorithms, including trained model weights, parameters (including
+# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
+# fine-tuning enabling code and other elements of the foregoing made publicly available
+# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
+
+import trimesh
+
+
+def remesh_mesh(mesh_path, remesh_path, method='trimesh'):
+    if method == 'trimesh':
+        mesh_simplify_trimesh(mesh_path, remesh_path)
+    else:
+        raise NotImplementedError(f'Method {method} has not been implemented.')
+
+
+def mesh_simplify_trimesh(inputpath, outputpath):
+    import pymeshlab
+    ms = pymeshlab.MeshSet()
+    ms.load_new_mesh(inputpath, load_in_a_single_layer=True)
+    ms.save_current_mesh(outputpath.replace('.glb', '.obj'), save_textures=False)
+
+    mesh = trimesh.load(outputpath.replace('.glb', '.obj'), force='mesh')
+    face_num = mesh.faces.shape[0]
+
+    if face_num > 100000:
+        mesh = mesh.simplify_quadric_decimation(40000)
+    mesh.export(outputpath)
diff --git a/hy3dgen/texgen/utils/uv_warp_utils.py b/hy3dgen/texgen/utils/uv_warp_utils.py
new file mode 100644
index 0000000..f55a924
--- /dev/null
+++ b/hy3dgen/texgen/utils/uv_warp_utils.py
@@ -0,0 +1,32 @@
+# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
+# except for the third-party components listed below.
+# Hunyuan 3D does not impose any additional limitations beyond what is outlined
+# in the repsective licenses of these third-party components.
+# Users must comply with all terms and conditions of original licenses of these third-party
+# components and must ensure that the usage of the third party components adheres to
+# all relevant laws and regulations.
+
+# For avoidance of doubts, Hunyuan 3D means the large language models and
+# their software and algorithms, including trained model weights, parameters (including
+# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
+# fine-tuning enabling code and other elements of the foregoing made publicly available
+# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
+
+import trimesh
+import xatlas
+
+
+def mesh_uv_wrap(mesh):
+    if isinstance(mesh, trimesh.Scene):
+        mesh = mesh.dump(concatenate=True)
+
+    if len(mesh.faces) > 500000000:
+        raise ValueError("The mesh has more than 500,000,000 faces, which is not supported.")
+
+    vmapping, indices, uvs = xatlas.parametrize(mesh.vertices, mesh.faces)
+
+    mesh.vertices = mesh.vertices[vmapping]
+    mesh.faces = indices
+    mesh.visual.uv = uvs
+
+    return mesh
diff --git a/hy3dgen/text2image.py b/hy3dgen/text2image.py
new file mode 100644
index 0000000..2c8a3ab
--- /dev/null
+++ b/hy3dgen/text2image.py
@@ -0,0 +1,81 @@
+# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
+# except for the third-party components listed below.
+# Hunyuan 3D does not impose any additional limitations beyond what is outlined
+# in the repsective licenses of these third-party components.
+# Users must comply with all terms and conditions of original licenses of these third-party
+# components and must ensure that the usage of the third party components adheres to
+# all relevant laws and regulations.
+ +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +import os +import random + +import numpy as np +import torch +from diffusers import AutoPipelineForText2Image + + +def seed_everything(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + os.environ["PL_GLOBAL_SEED"] = str(seed) + + +class HunyuanDiTPipeline: + def __init__( + self, + model_path="Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled", + device='cuda' + ): + self.device = device + self.pipe = AutoPipelineForText2Image.from_pretrained( + model_path, + torch_dtype=torch.float16, + enable_pag=True, + pag_applied_layers=["blocks.(16|17|18|19)"] + ).to(device) + self.pos_txt = ",白色背景,3D风格,最佳质量" + self.neg_txt = "文本,特写,裁剪,出框,最差质量,低质量,JPEG伪影,PGLY,重复,病态," \ + "残缺,多余的手指,变异的手,画得不好的手,画得不好的脸,变异,畸形,模糊,脱水,糟糕的解剖学," \ + "糟糕的比例,多余的肢体,克隆的脸,毁容,恶心的比例,畸形的肢体,缺失的手臂,缺失的腿," \ + "额外的手臂,额外的腿,融合的手指,手指太多,长脖子" + + def compile(self): + # accelarate hunyuan-dit transformer,first inference will cost long time + torch.set_float32_matmul_precision('high') + self.pipe.transformer = torch.compile(self.pipe.transformer, fullgraph=True) + # self.pipe.vae.decode = torch.compile(self.pipe.vae.decode, fullgraph=True) + generator = torch.Generator(device=self.pipe.device) # infer once for hot-start + out_img = self.pipe( + prompt='美少女战士', + negative_prompt='模糊', + num_inference_steps=25, + pag_scale=1.3, + width=1024, + height=1024, + generator=generator, + return_dict=False + )[0][0] + + @torch.no_grad() + def __call__(self, prompt, seed=0): + seed_everything(seed) + generator = torch.Generator(device=self.pipe.device) + generator = generator.manual_seed(int(seed)) + out_img = self.pipe( + prompt=prompt[:60] + self.pos_txt, + negative_prompt=self.neg_txt, + num_inference_steps=25, + pag_scale=1.3, + width=1024, + height=1024, + generator=generator, + return_dict=False + )[0][0] + return out_img diff --git a/minimal_demo.py b/minimal_demo.py new file mode 100644 index 0000000..c268422 --- /dev/null +++ b/minimal_demo.py @@ -0,0 +1,33 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. 
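+
+# The demo goes image -> shape -> texture. If you only have a text prompt, an input
+# image can be generated first with the text-to-image helper in hy3dgen/text2image.py,
+# roughly like this (a sketch; the HunyuanDiT weights are fetched on first use):
+#
+#   from hy3dgen.text2image import HunyuanDiTPipeline
+#   t2i = HunyuanDiTPipeline()
+#   image = t2i('a lovely rabbit eating carrots')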
+ +from PIL import Image + +from hy3dgen.rembg import BackgroundRemover +from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline +from hy3dgen.texgen import Hunyuan3DPaintPipeline + +model_path = 'tencent/Hunyuan3D-2' +pipeline_shapegen = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(model_path) +pipeline_texgen = Hunyuan3DPaintPipeline.from_pretrained(model_path) + +image_path = 'assets/demo.png' +image = Image.open(image_path).convert("RGBA") +if image.mode == 'RGB': + rembg = BackgroundRemover() + image = rembg(image) + +mesh = pipeline_shapegen(image=image)[0] +mesh = pipeline_texgen(mesh, image=image) +mesh.export('demo.glb') diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..abdab84 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,40 @@ +ninja +pybind11 + +diffusers +einops +opencv-python +numpy +torch +transformers +torchvision +#taming-transformers-rom1504 +#ConfigArgParse +#ipdb +omegaconf + +#sentencepiece +tqdm + +# Mesh Processing +trimesh +pymeshlab +pygltflib +xatlas +#kornia +#facexlib + +# Training +accelerate +#pytorch_lightning +#scikit-learn +#scikit-image + +# Demo only +gradio +fastapi +uvicorn +rembg +onnxruntime +#gevent +#geventhttpclient \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..5339da1 --- /dev/null +++ b/setup.py @@ -0,0 +1,46 @@ +# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT +# except for the third-party components listed below. +# Hunyuan 3D does not impose any additional limitations beyond what is outlined +# in the repsective licenses of these third-party components. +# Users must comply with all terms and conditions of original licenses of these third-party +# components and must ensure that the usage of the third party components adheres to +# all relevant laws and regulations. + +# For avoidance of doubts, Hunyuan 3D means the large language models and +# their software and algorithms, including trained model weights, parameters (including +# optimizer states), machine-learning model code, inference-enabling code, training-enabling code, +# fine-tuning enabling code and other elements of the foregoing made publicly available +# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT. + +from setuptools import setup, find_packages + +setup( + name="hy3dgen", + version="2.0.2", + url="https://github.com/Tencent/Hunyuan3D-2", + packages=find_packages(), + include_package_data=True, + package_data={"hy3dgen": ["assets/*", "assets/**/*"]}, + install_requires=[ + 'gradio', + "tqdm>=4.66.3", + 'numpy', + 'ninja', + 'diffusers', + 'pybind11', + 'opencv-python', + 'einops', + "transformers>=4.48.0", + 'omegaconf', + 'trimesh', + 'pymeshlab', + 'pygltflib', + 'xatlas', + 'accelerate', + 'gradio', + 'fastapi', + 'uvicorn', + 'rembg', + 'onnxruntime' + ] +)
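+
+# For local development, the package can be installed in editable mode from the
+# repository root:
+#   pip install -e .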