augxlabs
diff --git a/‎.gitignore
+1 b/‎.gitignore
+1
diff --git a/‎Dockerfile
+1-1 b/‎Dockerfile
+1-1
diff --git a/‎README.md
+42-2 b/‎README.md
+42-2
diff --git a/‎docker-compose.yml
+3-1 b/‎docker-compose.yml
+3-1
diff --git a/‎main.py
+119-4 b/‎main.py
+119-4
@@ -102,6 +102,7 @@ celerybeat.pid
 *.sage.py
 
 # Environments
+default.env
 .env
 .venv
 env/
 
@@ -1,4 +1,4 @@
-FROM python:3.8
+FROM python:3.9
 WORKDIR /usr/app/src
 COPY requirements.txt ./
 RUN pip install --no-cache-dir -r requirements.txt
 
@@ -1,12 +1,52 @@
-# python-template
-Template to run python in a Docker container using docker-compose
+# assembly-ai
+Run AssemblyAI audio transcription in a Docker container using docker-compose.
+API reference: https://www.assemblyai.com/docs/reference#transcript
 
 ## pre-requisite
 
 - [install docker](https://www.docker.com/get-started/)
+- create a `default.env` file (the file loaded through `env_file` in `docker-compose.yml`) with your API key and content URL
+
+```bash
+ASSEMBLY_AI_TOKEN=
+CONTENT_URL=https://augie-public-test.s3.amazonaws.com/89e5915c-cf8b-4f18-9b22-31255e4155cc/e2dc554f-740b-4973-a944-3d53046621a8/434d1c6a-7974-4749-9416-a92a3a3fe597.mp3
+```
 
 ## run
 
 ```sh
 docker-compose up --build
+```
+
+## results
+
+### paragraph
+
+```json
+[
+    {
+        "text": "Storytelling transporting an audience into your own imagination. For over 60,000 years, humans have been sharing stories with each other. From around the fire to the first cave paintings, to artwork and to interactive TV shows. We've evolved our ability to more richly engage our audiences. It's never been easier to record videos of current events, sports, arts, comedy, anything you can imagine.",
+        "start": 912,
+        "end": 25197,
+        "confidence": 0.90779,
+        "words": [
+            {
+                "text": "Storytelling",
+                "start": 912,
+                "end": 1917,
+                "confidence": 0.90779,
+                "speaker": null
+            },
+            {
+                "text": "transporting",
+                "start": 2052,
+                "end": 2727,
+                "confidence": 0.89105,
+                "speaker": null
+            },
+            ...
+            ...
+        ]
+    },
+]
 ```
@@ -2,6 +2,8 @@ version: "3.7"
 services:
   operator:
     build: .
-    image: python_template:0.1.0
+    image: assembly_ai:0.1.0
+    # default.env supplies ASSEMBLY_AI_TOKEN and CONTENT_URL to main.py
+    env_file:
+      - default.env
     volumes: 
       - ${PWD}:/usr/app/src/
@@ -1,5 +1,120 @@
-def main():    
-    print("main")
-    
-if __name__ == "__main__":
+import requests
+import time
+import os
+import json
+
+# AssemblyAI transcript resource: jobs are created with a POST to this URL
+# and an individual job is addressed by appending '/<id>' to it.
+endpoint = 'https://api.assemblyai.com/v2/transcript'
+
+# Credentials and the media location come from the environment; either value
+# is None when the corresponding variable is not set.
+api_token = os.getenv('ASSEMBLY_AI_TOKEN')
+content_url = os.getenv('CONTENT_URL')
+
+# Headers attached to every request sent to the API.
+headers = {
+    'authorization': api_token,
+    'content-type': 'application/json'
+}
+transcript_endpoint = endpoint
+
+def request_transcript():
+    """Submit the audio at ``content_url`` for transcription.
+
+    Posts a transcript request, with the options listed below enabled, to
+    ``transcript_endpoint`` using the module-level ``headers``.
+
+    Returns:
+        The decoded JSON body of the API response.
+
+    Raises:
+        requests.exceptions.Timeout: if the API does not answer in time.
+    """
+    # NOTE(review): 'summary_type' is sent without a 'summarization' flag --
+    # confirm against the API reference that a summary is actually produced
+    # and that it may be combined with 'auto_chapters'.
+    transcript_request = {
+        'audio_url': content_url,
+        'filter_profanity': True,    # Profanity Filtering                  https://www.assemblyai.com/docs/core-transcription#profanity-filtering
+        'punctuate': True,           # Automate Punctuation and Casing      https://www.assemblyai.com/docs/core-transcription#automatic-punctuation-and-casing
+        'language_detection': True,  # Automatic Language Detection         https://www.assemblyai.com/docs/core-transcription#automatic-language-detection
+        'auto_highlights': True,     # Detect Important Phrases and Words   https://www.assemblyai.com/docs/audio-intelligence#detect-important-phrases-and-words
+        'content_safety': True,      # Content Moderation                   https://www.assemblyai.com/docs/audio-intelligence#content-moderation
+        'iab_categories': True,      # Topic Detection(IAB Categories)      https://www.assemblyai.com/docs/audio-intelligence#topic-detection-iab-classification
+        'sentiment_analysis': True,  # Sentiment Analysis                   https://www.assemblyai.com/docs/audio-intelligence#sentiment-analysis
+        'summary_type': 'bullets',   # Summary bullets                      https://www.assemblyai.com/docs/audio-intelligence#summarization
+        #'summary_type': 'gist',      # Summary gist
+        #'summary_type': 'headline',  # Summary headline
+        #'summary_type': 'paragraph', # Summary paragraph
+        'auto_chapters': True,       # Automatic Chapters                   https://www.assemblyai.com/docs/audio-intelligence#auto-chapters
+        'entity_detection': True,    # Entity Detection                     https://www.assemblyai.com/docs/audio-intelligence#entity-detection
+    }
+    transcript_response = requests.post(
+        transcript_endpoint,
+        json=transcript_request,
+        headers=headers,
+        # Without a timeout a stalled connection would block forever.
+        timeout=30,
+    )
+    return transcript_response.json()
+
+def make_polling_endpoint(transcript_response):
+    """Build the URL used to poll a submitted transcript.
+
+    Args:
+        transcript_response: Decoded JSON body returned when the transcript
+            was requested; its 'id' value identifies the job.
+
+    Returns:
+        The transcript URL for that id. The URL is also printed.
+
+    Raises:
+        KeyError: if the response carries no 'id' (for example because the
+            API rejected the request); the message includes the response's
+            'error' value when present.
+    """
+    try:
+        transcript_id = transcript_response['id']
+    except KeyError:
+        # Surface the API's own explanation instead of a bare KeyError: 'id'.
+        raise KeyError(
+            'transcript response has no id; API error: '
+            + str(transcript_response.get('error'))
+        ) from None
+    polling_endpoint = 'https://api.assemblyai.com/v2/transcript/' + transcript_id
+    print('Polling endpoint: ' + polling_endpoint)
+    return polling_endpoint
+
+def wait_for_completion(polling_endpoint, poll_interval=5, request_timeout=30):
+    """Poll a transcript until the API reports that it has finished.
+
+    Args:
+        polling_endpoint: URL of the transcript resource to poll.
+        poll_interval: Seconds to sleep between two polls.
+        request_timeout: Seconds to wait for each HTTP response.
+
+    Returns:
+        The decoded JSON body once its 'status' is 'completed', or None when
+        the API reports status 'error' (the error text is printed).
+    """
+    while True:
+        polling_response = requests.get(
+            polling_endpoint,
+            headers=headers,
+            timeout=request_timeout,
+        ).json()
+        status = polling_response['status']
+
+        if status == 'completed':
+            return polling_response
+
+        if status == 'error':
+            # str() keeps the report from failing on a missing or
+            # non-string error payload.
+            print('Error: ' + str(polling_response.get('error')))
+            return None
+
+        print('Status: ' + status)
+        time.sleep(poll_interval)
+
+def get_paragraphs(polling_endpoint):
+    """Fetch the paragraphs of a transcript.
+
+    Args:
+        polling_endpoint: URL of the transcript resource; '/paragraphs' is
+            appended to it.
+
+    Returns:
+        A list holding the items found under the 'paragraphs' key of the
+        JSON response.
+    """
+    response = requests.get(
+        polling_endpoint + '/paragraphs',
+        headers=headers,
+        timeout=30,  # do not hang forever on a stalled connection
+    )
+    return list(response.json()['paragraphs'])
+
+def get_sentences(polling_endpoint):
+    """Fetch the sentences of a transcript.
+
+    Args:
+        polling_endpoint: URL of the transcript resource; '/sentences' is
+            appended to it.
+
+    Returns:
+        A list holding the items found under the 'sentences' key of the
+        JSON response.
+    """
+    response = requests.get(
+        polling_endpoint + '/sentences',
+        headers=headers,
+        timeout=30,  # do not hang forever on a stalled connection
+    )
+    return list(response.json()['sentences'])
+
+def _save_json(path, payload):
+    """Write payload to path as JSON indented by four spaces."""
+    with open(path, 'w', encoding='utf-8') as outfile:
+        outfile.write(json.dumps(payload, indent=4))
+
+
+def _save_text(path, url):
+    """Download url and write the response body to path as text."""
+    response = requests.get(url, headers=headers, timeout=30)
+    with open(path, 'w', encoding='utf-8') as outfile:
+        outfile.write(response.text)
+
+
+def main():
+    """Request a transcript, wait for it, and save every result to disk.
+
+    Files written to the working directory: request.json,
+    transcription.json, paragraphs.json, sentences.json, srt.txt, vtt.txt.
+
+    Raises:
+        SystemExit: if the transcription ends in an error, so that no
+            result files are produced from a failed job.
+    """
+    # save request transcript
+    transcript_response = request_transcript()
+    _save_json('request.json', transcript_response)
+
+    # poll and wait
+    polling_endpoint = make_polling_endpoint(transcript_response)
+    transcription = wait_for_completion(polling_endpoint)
+    if transcription is None:
+        # wait_for_completion yields None for a failed job; stop here rather
+        # than writing "null" and requesting results that do not exist.
+        raise SystemExit('Transcription did not complete; nothing saved.')
+
+    # save transcription
+    _save_json('transcription.json', transcription)
+
+    # save paragraphs
+    _save_json('paragraphs.json', get_paragraphs(polling_endpoint))
+
+    # save sentences
+    _save_json('sentences.json', get_sentences(polling_endpoint))
+
+    # save srt
+    _save_text('srt.txt', polling_endpoint + '/srt')
+
+    # save vtt
+    _save_text('vtt.txt', polling_endpoint + '/vtt')
+
+if __name__ == '__main__':
     main()
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-FROM python:3.8`
	`1`	`+FROM python:3.9`
`2`	`2`	`WORKDIR /usr/app/src`
`3`	`3`	`COPY requirements.txt ./`
`4`	`4`	`RUN pip install --no-cache-dir -r requirements.txt`